├── .github └── workflows │ └── testing.yml ├── .gitignore ├── CHANGES.txt ├── LICENSE ├── LICENSES ├── CLOUDPICKLE_LICENSE ├── JOBLIB_LICENSE ├── PANDAS_LICENSE ├── PYPARSING_LICENSE ├── PYPRIND_LICENSE ├── PY_STRINGMATCHING_LICENSE ├── PY_STRINGSIMJOIN_LICENSE ├── SCIKIT_LEARN_LICENSE └── SIX_LICENSE ├── MANIFEST.in ├── README.rst ├── asv.conf.json ├── benchmarks ├── __init__.py ├── benchmark_attr_equiv_blocker.py ├── benchmark_blackbox_blocker.py ├── benchmark_debugblocker.py ├── benchmark_down_sample_sampler.py ├── benchmark_overlap_blocker.py ├── benchmark_rule_based_blocker.py └── benchmark_sn_blocker.py ├── build.bat ├── build.sh ├── build_tools ├── appveyor │ ├── install.ps1 │ ├── rm_rf.py │ └── run_with_env.cmd ├── cythonize.py ├── move-conda-package.py └── requirements_dev.txt ├── conda.recipe ├── build.bat ├── build.sh └── meta.yaml ├── continuous-integration └── appveyor │ ├── install.ps1 │ ├── rm_rf.py │ └── run_with_env.cmd ├── cythonize.dat ├── docs ├── Makefile ├── conf.py ├── conf_copy.py ├── contributing.rst ├── how_to_guide.rst ├── index.rst ├── make.bat ├── make_copy.sh ├── themes │ └── nature_with_gtoc │ │ ├── layout.html │ │ ├── static │ │ └── nature.css_t │ │ └── theme.conf └── user_manual │ ├── api │ ├── actual_commands.rst │ ├── adding_features.rst │ ├── blocking.rst │ ├── combiner.rst │ ├── combining_blocker_outputs.rst │ ├── creating_the_features_automatically.rst │ ├── creating_the_features_manually.rst │ ├── dask_commands.rst │ ├── data_exploration.rst │ ├── debugging_blocker_output.rst │ ├── debugging_matcher.rst │ ├── downsampling.rst │ ├── evaluating_the_matching_output.rst │ ├── extracting_feature_vectors.rst │ ├── handling_features.rst │ ├── handling_metadata.rst │ ├── imputing_missing_values.rst │ ├── labeling.rst │ ├── loading_and_saving_objects.rst │ ├── matching.rst │ ├── overview.rst │ ├── reading_and_writing_data.rst │ ├── sampling.rst │ ├── selecting_matcher.rst │ ├── splitting_data_into_train_test.rst │ ├── supported_matchers.rst │ ├── supported_similarity_functions.rst │ ├── supported_tokenizers.rst │ ├── triggers.rst │ └── tuners.rst │ ├── blocker_hierarchy.png │ ├── blocking.rst │ ├── create_feats_for_blocking.rst │ ├── create_feats_for_matching.rst │ ├── data_exploration.rst │ ├── datastructures.rst │ ├── debugging_blocking.rst │ ├── debugging_matcher.rst │ ├── down_sampling.rst │ ├── evaluate_matching.rst │ ├── example-blocking-matching.png │ ├── example-dev-stage.png │ ├── example-match-two-tables.png │ ├── example-prod-stage.png │ ├── example-workflow.png │ ├── extract_feat_vecs.rst │ ├── guides.rst │ ├── imputing_missing_values.rst │ ├── installation.rst │ ├── labeling.rst │ ├── matchercombiner.rst │ ├── matching.rst │ ├── misc.rst │ ├── overview.rst │ ├── profiling.rst │ ├── read_csv_files.rst │ ├── sampling.rst │ ├── select_best_matcher.rst │ ├── split_train_test.rst │ ├── steps_supp_em_workflows.rst │ ├── triggers.rst │ └── whatisnew.rst ├── notebooks ├── .ipynb_checkpoints │ ├── Untitled-checkpoint.ipynb │ ├── test_addfeature_py2-checkpoint.ipynb │ ├── test_attr_equiv_blocker-checkpoint.ipynb │ ├── test_autogenfeature_py3-checkpoint.ipynb │ ├── test_blackboxfunction-checkpoint.ipynb │ ├── test_blocker_combiner-checkpoint.ipynb │ ├── test_catalog-checkpoint.ipynb │ ├── test_combine_ids-checkpoint.ipynb │ ├── test_debug_matcher-checkpoint.ipynb │ ├── test_evaluation-checkpoint.ipynb │ ├── test_feature-checkpoint.ipynb │ ├── test_feature_add_features-checkpoint.ipynb │ ├── test_feature_attributeutils-checkpoint.ipynb │ ├── 
test_feature_extract_featurevecs-checkpoint.ipynb │ ├── test_feature_parse_string-checkpoint.ipynb │ ├── test_io-checkpoint.ipynb │ ├── test_kitchen-checkpoint.ipynb │ ├── test_labeling-checkpoint.ipynb │ ├── test_load_save-checkpoint.ipynb │ ├── test_overlapblocker-checkpoint.ipynb │ ├── test_projection-checkpoint.ipynb │ ├── test_rulebased_blocker-checkpoint.ipynb │ ├── test_sampling-checkpoint.ipynb │ └── test_trtst_split-checkpoint.ipynb ├── guides │ ├── .ipynb_checkpoints │ │ ├── Adding Features to Feature Table-checkpoint.ipynb │ │ ├── Combining Multiple Blockers-checkpoint.ipynb │ │ ├── Debugging Blocker Output-checkpoint.ipynb │ │ ├── Down Sampling-checkpoint.ipynb │ │ ├── Editing and Generate Features for Blocking Manually-checkpoint.ipynb │ │ ├── Evaluating the Selected Matcher-checkpoint.ipynb │ │ ├── Generating Features for Blocking Manually-checkpoint.ipynb │ │ ├── Performing Blocking Using Blackbox Blocker-checkpoint.ipynb │ │ ├── Performing Blocking Using Built-In Blockers (Attr. Equivalence Blocker)-checkpoint.ipynb │ │ ├── Performing Blocking Using Built-In Blockers (Overlap Blocker)-checkpoint.ipynb │ │ ├── Performing Blocking Using Rule-Based Blocking-checkpoint.ipynb │ │ ├── Reading CSV Files from Disk-checkpoint.ipynb │ │ ├── Reading the CSV Files from Disk-checkpoint.ipynb │ │ ├── Removing Features From Feature Table-checkpoint.ipynb │ │ ├── Sampling and Labeling-checkpoint.ipynb │ │ └── Selecting the Best Learning Matcher-checkpoint.ipynb │ ├── end_to_end_em_guides │ │ ├── .ipynb_checkpoints │ │ │ ├── Basic EM Workflow DBLP ACM-checkpoint.ipynb │ │ │ ├── Basic EM Workflow Restaurants - 1-checkpoint.ipynb │ │ │ ├── Basic EM Workflow Restaurants - 2-checkpoint.ipynb │ │ │ ├── Basic EM Workflow Restaurants - 3-checkpoint.ipynb │ │ │ └── Basic EM Workflow-checkpoint.ipynb │ │ ├── Basic EM Workflow DBLP ACM.ipynb │ │ ├── Basic EM Workflow Restaurants - 1.html │ │ ├── Basic EM Workflow Restaurants - 1.ipynb │ │ ├── Basic EM Workflow Restaurants - 2.html │ │ ├── Basic EM Workflow Restaurants - 2.ipynb │ │ ├── Basic EM Workflow Restaurants - 3.html │ │ ├── Basic EM Workflow Restaurants - 3.ipynb │ │ ├── Basic EM Workflow.html │ │ ├── Basic EM Workflow.ipynb │ │ └── helper_functions.py │ └── step_wise_em_guides │ │ ├── .ipynb_checkpoints │ │ ├── Adding Features to Feature Table-checkpoint.ipynb │ │ ├── Combining Multiple Blockers-checkpoint.ipynb │ │ ├── Data Exploration-checkpoint.ipynb │ │ ├── Data Profiling-checkpoint.ipynb │ │ ├── Debugging Blocker Output-checkpoint.ipynb │ │ ├── Down Sampling-checkpoint.ipynb │ │ ├── Editing and Generating Features Manually-checkpoint.ipynb │ │ ├── Evaluating the Selected Matcher-checkpoint.ipynb │ │ ├── Generating Features Manually-checkpoint.ipynb │ │ ├── Performing Blocking Using Blackbox Blocker-checkpoint.ipynb │ │ ├── Performing Blocking Using Built-In Blockers (Attr. 
Equivalence Blocker)-checkpoint.ipynb │ │ ├── Performing Blocking Using Built-In Blockers (Overlap Blocker)-checkpoint.ipynb │ │ ├── Performing Blocking Using Built-In Blockers (Sorted Neighborhood Blocker)-checkpoint.ipynb │ │ ├── Performing Blocking Using Rule-Based Blocking-checkpoint.ipynb │ │ ├── Performing Matching Using a ML Matcher-checkpoint.ipynb │ │ ├── Performing Matching with a Rule-Based Matcher-checkpoint.ipynb │ │ ├── Reading CSV Files from Disk-checkpoint.ipynb │ │ ├── Removing Features From Feature Table-checkpoint.ipynb │ │ ├── Sampling and Labeling-checkpoint.ipynb │ │ ├── Selecting the Best Learning Matcher-checkpoint.ipynb │ │ └── Using Match Triggers to Improve Results-checkpoint.ipynb │ │ ├── Adding Features to Feature Table.ipynb │ │ ├── Combining Multiple Blockers.ipynb │ │ ├── Data Exploration.ipynb │ │ ├── Data Profiling.ipynb │ │ ├── Debugging Blocker Output.ipynb │ │ ├── Down Sampling.ipynb │ │ ├── Editing and Generating Features Manually.ipynb │ │ ├── Evaluating the Selected Matcher.ipynb │ │ ├── Generating Features Manually.ipynb │ │ ├── Performing Blocking Using Blackbox Blocker.ipynb │ │ ├── Performing Blocking Using Built-In Blockers (Attr. Equivalence Blocker).ipynb │ │ ├── Performing Blocking Using Built-In Blockers (Overlap Blocker).ipynb │ │ ├── Performing Blocking Using Built-In Blockers (Sorted Neighborhood Blocker).ipynb │ │ ├── Performing Blocking Using Rule-Based Blocking.ipynb │ │ ├── Performing Matching Using a ML Matcher.ipynb │ │ ├── Performing Matching with a Rule-Based Matcher.ipynb │ │ ├── Reading CSV Files from Disk.ipynb │ │ ├── Removing Features From Feature Table.ipynb │ │ ├── Sampling and Labeling.ipynb │ │ ├── Selecting the Best Learning Matcher.ipynb │ │ └── Using Match Triggers to Improve Results.ipynb └── vldb_demo │ ├── .ipynb_checkpoints │ ├── Demo_notebook_v6-checkpoint.ipynb │ └── demo-checkpoint.ipynb │ ├── README │ ├── acm_demo.csv │ ├── dblp_demo.csv │ ├── demo.ipynb │ ├── labeled_data_demo.csv │ └── profiler.py ├── py_entitymatching ├── __init__.py ├── blocker │ ├── __init__.py │ ├── attr_equiv_blocker.py │ ├── black_box_blocker.py │ ├── blocker.py │ ├── overlap_blocker.py │ ├── rule_based_blocker.py │ └── sn_blocker.py ├── blockercombiner │ ├── __init__.py │ └── blockercombiner.py ├── catalog │ ├── __init__.py │ ├── catalog.py │ └── catalog_manager.py ├── dask │ ├── __init__.py │ ├── dask_attr_equiv_blocker.py │ ├── dask_black_box_blocker.py │ ├── dask_down_sample.py │ ├── dask_dtmatcher.py │ ├── dask_extract_features.py │ ├── dask_logregmatcher.py │ ├── dask_nbmatcher.py │ ├── dask_overlap_blocker.py │ ├── dask_rfmatcher.py │ ├── dask_rule_based_blocker.py │ ├── dask_svm_matcher.py │ ├── dask_xgboost_matcher.py │ ├── daskmlmatcher.py │ └── utils.py ├── datasets │ ├── ACM.csv │ ├── DBLP.csv │ ├── acm_demo.csv │ ├── acm_demo.metadata │ ├── dblp_acm_demo_labels.csv │ ├── dblp_demo.csv │ ├── end-to-end │ │ ├── Demo_notebook_v6.ipynb │ │ ├── acm_demo.csv │ │ ├── acm_demo.metadata │ │ ├── dblp_demo.csv │ │ ├── dblp_demo.metadata │ │ ├── labeled_data_demo.csv │ │ ├── profiler.py │ │ └── restaurants │ │ │ ├── fodors.csv │ │ │ ├── lbl_restnt_wf1.csv │ │ │ ├── lbl_restnt_wf1.metadata │ │ │ ├── match_fodors_zagats_more_attrs.csv │ │ │ ├── match_fodors_zagats_more_attrs.metadata │ │ │ ├── matches_fodors_zagats.csv │ │ │ └── zagats.csv │ ├── final_matches.csv │ ├── labeled_data_demo.csv │ ├── person_table_A.csv │ ├── person_table_A.metadata │ ├── person_table_B.csv │ ├── person_table_B.metadata │ └── tableC.csv ├── debugblocker │ ├── 
GenerateRecomLists.cpp │ ├── GenerateRecomLists.h │ ├── OriginalTopkPlain.cpp │ ├── PrefixEvent.cpp │ ├── PrefixEvent.h │ ├── TopPair.cpp │ ├── TopPair.h │ ├── TopkHeader.cpp │ ├── TopkHeader.h │ ├── __init__.py │ ├── backup_debugblocker.py │ ├── debugblocker.py │ ├── debugblocker_cython.cpp │ └── debugblocker_cython.pyx ├── debugmatcher │ ├── __init__.py │ ├── debug_decisiontree_matcher.py │ ├── debug_gui_decisiontree_matcher.py │ ├── debug_gui_randomforest_matcher.py │ ├── debug_gui_utils.py │ └── debug_randomforest_matcher.py ├── evaluation │ ├── __init__.py │ └── evaluation.py ├── experimental │ └── __init__.py ├── explorer │ ├── __init__.py │ ├── openrefine │ │ ├── __init__.py │ │ └── openrefine_wrapper.py │ └── pandastable │ │ ├── __init__.py │ │ └── pandastable_wrapper.py ├── feature │ ├── __init__.py │ ├── addfeatures.py │ ├── attributeutils.py │ ├── autofeaturegen.py │ ├── extractfeatures.py │ ├── simfunctions.py │ └── tokenizers.py ├── gui │ ├── __init__.py │ ├── debug_gui_base.py │ ├── gui_utils.py │ └── table_gui.py ├── io │ ├── __init__.py │ ├── parsers.py │ └── pickles.py ├── labeler │ ├── __init__.py │ └── labeler.py ├── matcher │ ├── __init__.py │ ├── booleanrulematcher.py │ ├── dtmatcher.py │ ├── ensemblematcher.py │ ├── linregmatcher.py │ ├── logregmatcher.py │ ├── matcher.py │ ├── matcherutils.py │ ├── mlmatcher.py │ ├── nbmatcher.py │ ├── rfmatcher.py │ ├── rulematcher.py │ ├── svmmatcher.py │ └── xgboostmatcher.py ├── matchercombiner │ ├── __init__.py │ └── matchercombiner.py ├── matcherselector │ ├── __init__.py │ ├── mlmatchercombinerselection.py │ └── mlmatcherselection.py ├── sampler │ ├── __init__.py │ ├── down_sample.py │ └── single_table.py ├── tests │ ├── __init__.py │ ├── _test_debug_matcher_dt.py │ ├── _test_debug_matcher_rf.py │ ├── _test_matcherselector_mlmatcherselection_xg.py │ ├── test_attr_equiv_blocker.py │ ├── test_black_box_blocker.py │ ├── test_blockercombiner.py │ ├── test_catalog.py │ ├── test_datasets │ │ ├── A.csv │ │ ├── A.metadata │ │ ├── B.csv │ │ ├── C.csv │ │ ├── C.metadata │ │ ├── C1.csv │ │ ├── C1.metadata │ │ ├── D.csv │ │ ├── D.metadata │ │ ├── blocker │ │ │ ├── table_A_wi_missing_vals.csv │ │ │ ├── table_A_wi_missing_vals.metadata │ │ │ ├── table_B_wi_missing_vals.csv │ │ │ └── table_B_wi_missing_vals.metadata │ │ ├── blockercombiner │ │ │ ├── C1.csv │ │ │ ├── C1.metadata │ │ │ ├── C1_ex_1.csv │ │ │ ├── C1_ex_1.metadata │ │ │ ├── C2.csv │ │ │ ├── C2.metadata │ │ │ ├── C2_ex_1.csv │ │ │ ├── C2_ex_1.metadata │ │ │ ├── C3.csv │ │ │ ├── C3.metadata │ │ │ ├── C3_ex_2.csv │ │ │ ├── C3_ex_2.metadata │ │ │ ├── C4_ex_1.csv │ │ │ ├── C4_ex_1.metadata │ │ │ ├── C4_ex_2.csv │ │ │ ├── C4_ex_2.metadata │ │ │ ├── C_ex_1.csv │ │ │ ├── C_ex_1.metadata │ │ │ ├── C_ex_2.csv │ │ │ ├── C_ex_2.metadata │ │ │ ├── C_ex_4.csv │ │ │ └── C_ex_4.metadata │ │ ├── catalog │ │ │ ├── A.metadata │ │ │ ├── A_dupid.csv │ │ │ ├── A_inv_fk.csv │ │ │ └── A_mvals.csv │ │ ├── debugblocker │ │ │ ├── test_debugblocker_13.metadata │ │ │ ├── test_debugblocker_13_out.csv │ │ │ ├── test_debugblocker_cand.csv │ │ │ ├── test_debugblocker_ltable.csv │ │ │ ├── test_debugblocker_rtable.csv │ │ │ ├── test_get_tokenized_table_1.txt │ │ │ ├── test_get_tokenized_table_2.txt │ │ │ ├── test_topk_sim_join_1_A.txt │ │ │ ├── test_topk_sim_join_1_B.txt │ │ │ └── test_topk_sim_join_1_C.txt │ │ ├── io │ │ │ ├── A.csv │ │ │ ├── A.mdx │ │ │ ├── A_dupid.csv │ │ │ ├── A_key_zipcode.csv │ │ │ ├── A_key_zipcode.metadata │ │ │ ├── A_md_wrongformat.csv │ │ │ ├── A_md_wrongformat.metadata │ │ │ ├── 
A_mvals.csv │ │ │ ├── C_partialmeta.csv │ │ │ ├── C_partialmeta.metadata │ │ │ ├── InvalidMetadata1.csv │ │ │ ├── InvalidMetadata1.metadata │ │ │ ├── InvalidMetadata2.csv │ │ │ ├── InvalidMetadata2.metadata │ │ │ ├── expected_A.metadata │ │ │ └── expected_C.metadata │ │ ├── matcherselector │ │ │ ├── ACM_demo.csv │ │ │ ├── DBLP_demo.csv │ │ │ ├── feat_vecs.csv │ │ │ └── feat_vecs.metadata │ │ ├── restA.csv │ │ ├── restB.csv │ │ └── sandbox │ │ │ ├── A.pkl │ │ │ └── A.pklmetadata │ ├── test_debugblocker.py │ ├── test_evaluation.py │ ├── test_feature_addfeatures.py │ ├── test_feature_attributeutils.py │ ├── test_feature_autofeaturegen.py │ ├── test_feature_extractfeaturevecs.py │ ├── test_feature_simfunctions.py │ ├── test_feature_tokenizers.py │ ├── test_io_import_export.py │ ├── test_io_load_save.py │ ├── test_labeler.py │ ├── test_match_trigger.py │ ├── test_matcher_ml_matcher.py │ ├── test_matcherselector_mlmatcherselection.py │ ├── test_overlap_blocker.py │ ├── test_rule_based_blocker.py │ ├── test_rule_based_matcher.py │ ├── test_sampler_down_sample.py │ ├── test_sampler_single_table.py │ ├── test_validation_helper.py │ └── utils.py ├── triggers │ ├── __init__.py │ └── matchtrigger.py ├── tuner │ ├── __init__.py │ ├── tuner_down_sample.py │ └── tuner_overlap_blocker.py └── utils │ ├── __init__.py │ ├── catalog_helper.py │ ├── generic_helper.py │ ├── pandas_helper.py │ ├── stop_words.txt │ └── validation_helper.py ├── requirements.txt ├── requirements.yml └── setup.py /.github/workflows/testing.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies and run with a variety of Python versions 2 | 3 | name: Python package 4 | 5 | on: 6 | - push 7 | - pull_request 8 | 9 | jobs: 10 | build: 11 | 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] 16 | os: ["ubuntu-latest", "windows-latest", "macos-latest"] 17 | runs-on: ${{ matrix.os }} 18 | env: 19 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | pip install numpy PyQt5 Cython 31 | pip install -r requirements.txt 32 | - name: Install package 33 | run: | 34 | python setup.py build_ext --inplace 35 | - name: Run tests 36 | run: | 37 | python -m unittest -v 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | # temp dir 62 | scratch/ 63 | 64 | # idea files 65 | .idea/ 66 | # Created by .ignore support plugin (hsz.mobi) 67 | 68 | py_entitymatching/datasets/msd_reduced.csv 69 | py_entitymatching/tests/test_datasets/sandbox/* 70 | py_entitymatching/datasets/example_datasets 71 | cover/ 72 | results/ 73 | html/ 74 | 75 | *.dot 76 | *.png 77 | *.pkl 78 | *.swp 79 | 80 | notebooks/how-to-guides/bdata/ 81 | garage/ 82 | 83 | .DS_Store 84 | -------------------------------------------------------------------------------- /CHANGES.txt: -------------------------------------------------------------------------------- 1 | v0.4.2 - 2/7/2024 2 | * Fixed various errors in setup.py when installing via pip 3 | * Adjusted setuptools.setup project name to match the name on PyPI 4 | 5 | v0.4.1 - 3/18/2023 6 | * Dropped support for Python 3.6. 7 | * Added support for Python 3.10 and 3.11. 8 | * Switched from Nose to vanilla Unittest. 9 | * Replaced Travis and Appveyor CI testing with GitHub Actions. 10 | 11 | v0.4.0 - 11/20/2020 12 | * Dropped support for Python 2 and 3.5. 13 | * To support Python 3.8, updated the function 14 | py_entitymatching.matcher.matcherutils.impute_table() to use current scikit-learn's 15 | SimpleImputer; see issue #127. 16 | 17 | v0.3.3 - 10/19/2020 18 | * Started tracking release changes in CHANGES.txt. 19 | * Made minor updates to Pandas usage to avoid depending on downgraded versions. 20 | * Added stricter scikit-learn dependency requirements to preserve the old Imputer API; this will change in a future release. 21 | * Users can now provide metadata for blackbox features (e.g., left_attribute, right_attribute). 22 | * This is the last version of py_entitymatching that will support Python 2 and Python 3.5. 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, anhaidgroup 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of py_entitymatching nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /LICENSES/CLOUDPICKLE_LICENSE: -------------------------------------------------------------------------------- 1 | This module was extracted from the `cloud` package, developed by 2 | PiCloud, Inc. 3 | 4 | Copyright (c) 2015, Cloudpickle contributors. 5 | Copyright (c) 2012, Regents of the University of California. 6 | Copyright (c) 2009 PiCloud, Inc. http://www.picloud.com. 7 | All rights reserved. 8 | 9 | Redistribution and use in source and binary forms, with or without 10 | modification, are permitted provided that the following conditions 11 | are met: 12 | * Redistributions of source code must retain the above copyright 13 | notice, this list of conditions and the following disclaimer. 14 | * Redistributions in binary form must reproduce the above copyright 15 | notice, this list of conditions and the following disclaimer in the 16 | documentation and/or other materials provided with the distribution. 17 | * Neither the name of the University of California, Berkeley nor the 18 | names of its contributors may be used to endorse or promote 19 | products derived from this software without specific prior written 20 | permission. 21 | 22 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 25 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 26 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 27 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 28 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 29 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 30 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 31 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 32 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 | 34 | -------------------------------------------------------------------------------- /LICENSES/JOBLIB_LICENSE: -------------------------------------------------------------------------------- 1 | joblib is BSD-licenced (3 clause): 2 | 3 | This software is OSI Certified Open Source Software. 4 | OSI Certified is a certification mark of the Open Source Initiative. 5 | 6 | Copyright (c) 2009-2011, joblib developpers 7 | All rights reserved. 8 | 9 | Redistribution and use in source and binary forms, with or without 10 | modification, are permitted provided that the following conditions are met: 11 | 12 | * Redistributions of source code must retain the above copyright notice, 13 | this list of conditions and the following disclaimer. 
14 | 15 | * Redistributions in binary form must reproduce the above copyright notice, 16 | this list of conditions and the following disclaimer in the documentation 17 | and/or other materials provided with the distribution. 18 | 19 | * Neither the name of Gael Varoquaux. nor the names of other joblib 20 | contributors may be used to endorse or promote products derived from 21 | this software without specific prior written permission. 22 | 23 | This software is provided by the copyright holders and contributors 24 | "as is" and any express or implied warranties, including, but not 25 | limited to, the implied warranties of merchantability and fitness for 26 | a particular purpose are disclaimed. In no event shall the copyright 27 | owner or contributors be liable for any direct, indirect, incidental, 28 | special, exemplary, or consequential damages (including, but not 29 | limited to, procurement of substitute goods or services; loss of use, 30 | data, or profits; or business interruption) however caused and on any 31 | theory of liability, whether in contract, strict liability, or tort 32 | (including negligence or otherwise) arising in any way out of the use 33 | of this software, even if advised of the possibility of such 34 | damage. 35 | -------------------------------------------------------------------------------- /LICENSES/PYPARSING_LICENSE: -------------------------------------------------------------------------------- 1 | Permission is hereby granted, free of charge, to any person obtaining 2 | a copy of this software and associated documentation files (the 3 | "Software"), to deal in the Software without restriction, including 4 | without limitation the rights to use, copy, modify, merge, publish, 5 | distribute, sublicense, and/or sell copies of the Software, and to 6 | permit persons to whom the Software is furnished to do so, subject to 7 | the following conditions: 8 | 9 | The above copyright notice and this permission notice shall be 10 | included in all copies or substantial portions of the Software. 11 | 12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 13 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 14 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 15 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 16 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 17 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 18 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | -------------------------------------------------------------------------------- /LICENSES/PYPRIND_LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014-2016, Sebastian Raschka 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of biopandas nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 
17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | 29 | -------------------------------------------------------------------------------- /LICENSES/PY_STRINGMATCHING_LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, anhaidgroup 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of py_stringmatching nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /LICENSES/PY_STRINGSIMJOIN_LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, anhaidgroup 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 
13 | 14 | * Neither the name of py_stringsimjoin nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /LICENSES/SCIKIT_LEARN_LICENSE: -------------------------------------------------------------------------------- 1 | New BSD License 2 | 3 | Copyright (c) 2007–2016 The scikit-learn developers. 4 | All rights reserved. 5 | 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | a. Redistributions of source code must retain the above copyright notice, 11 | this list of conditions and the following disclaimer. 12 | b. Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | c. Neither the name of the Scikit-learn Developers nor the names of 16 | its contributors may be used to endorse or promote products 17 | derived from this software without specific prior written 18 | permission. 19 | 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 22 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 | ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 27 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 28 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 | LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 | OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 31 | DAMAGE. 
32 | -------------------------------------------------------------------------------- /LICENSES/SIX_LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010-2016 Benjamin Peterson 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so, 8 | subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 15 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 16 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 17 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 18 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include MANIFEST.in 2 | include README.rst 3 | include setup.py 4 | include py_entitymatching/datasets/*.csv 5 | recursive-include py_entitymatching/tests *.csv 6 | 7 | 8 | graft py_entitymatching 9 | 10 | global-exclude *.so 11 | global-exclude *.pyd 12 | global-exclude *.pyc 13 | global-exclude *~ 14 | global-exclude \#* 15 | global-exclude .git* 16 | global-exclude .DS_Store 17 | global-exclude *.png 18 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | py_entitymatching 2 | ================= 3 | 4 | This project seeks to build a Python software package to match entities 5 | between two tables using supervised learning. This problem is often 6 | referred to as entity matching (EM). Given two tables A and B, the goal of 7 | EM is to discover the tuple pairs between the two tables that refer to the 8 | same real-world entities. There are two main steps involved in entity matching: 9 | blocking and matching. The blocking step aims to remove obvious non-matching 10 | tuple pairs and reduce the set of pairs considered for matching. Entity matching in 11 | practice involves many more steps than just blocking and matching. While performing EM, 12 | users often execute many steps, e.g., exploring, cleaning, debugging, sampling, 13 | estimating accuracy, etc. Current EM systems, however, do not cover the entire 14 | EM pipeline, providing support only for a few steps (e.g., blocking, matching) while 15 | ignoring less well-known yet equally critical steps (e.g., debugging, sampling). 16 | This package seeks to support all the steps involved in the EM pipeline. 17 | 18 | The package is free, open-source, and BSD-licensed. 
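
Shown below is a minimal sketch of such an end-to-end workflow using this package's API. The CSV paths, key columns, and attribute names (e.g., 'zipcode', 'name') are illustrative, not part of any shipped dataset::

    import py_entitymatching as em

    # Read the two tables and tell py_entitymatching their key attributes.
    A = em.read_csv_metadata('tableA.csv', key='ID')
    B = em.read_csv_metadata('tableB.csv', key='ID')

    # Blocking: keep only the tuple pairs that agree on zipcode.
    ab = em.AttrEquivalenceBlocker()
    C = ab.block_tables(A, B, 'zipcode', 'zipcode',
                        l_output_attrs=['name'], r_output_attrs=['name'])

    # Sample candidate pairs and label them (opens a labeling GUI).
    S = em.sample_table(C, 450)
    G = em.label_table(S, 'gold')

    # Generate features and convert the labeled pairs to feature vectors.
    F = em.get_features_for_matching(A, B, validate_inferred_attr_types=False)
    H = em.extract_feature_vecs(G, feature_table=F, attrs_after='gold')

    # Train a decision-tree matcher, predict on held-out pairs, and evaluate.
    IJ = em.split_train_test(H, train_proportion=0.7)
    dt = em.DTMatcher()
    dt.fit(table=IJ['train'],
           exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold'],
           target_attr='gold')
    P = dt.predict(table=IJ['test'],
                   exclude_attrs=['_id', 'ltable_ID', 'rtable_ID', 'gold'],
                   target_attr='predicted', append=True, inplace=False)
    em.print_eval_summary(em.eval_matches(P, 'gold', 'predicted'))
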
19 | 20 | Important links 21 | =============== 22 | 23 | * Project Homepage: https://sites.google.com/site/anhaidgroup/projects/magellan/py_entitymatching 24 | * Code repository: https://github.com/anhaidgroup/py_entitymatching 25 | * User Manual: http://anhaidgroup.github.io/py_entitymatching/v0.4.0/index.html 26 | * Guides: http://anhaidgroup.github.io/py_entitymatching/v0.4.0/user_manual/guides.html 27 | * How to Contribute: http://anhaidgroup.github.io/py_entitymatching/v0.4.0/contributing.html 28 | * Issue Tracker: https://github.com/anhaidgroup/py_entitymatching/issues 29 | 30 | Dependencies 31 | ============ 32 | 33 | The required dependencies to build the package are: 34 | 35 | * numpy 1.7.0 or higher. Tested on version 1.19.4. 36 | * pandas (provides data structures to store and manage tables). Tested on version 1.1.4. 37 | * scikit-learn 0.22 or higher (provides implementations for common machine learning algorithms). Tested on version 0.23.2. 38 | * joblib (provides multiprocessing capabilities). Tested on version 0.17.0. 39 | * py_stringsimjoin (provides implementations for string similarity joins). Tested on version 0.3.2. 40 | * py_stringmatching (provides a set of string tokenizers and string similarity functions). Tested on version 0.4.2. 41 | * cloudpickle (provides functions to serialize Python constructs). Tested on version 1.6.0. 42 | * pyprind (library to display progress indicators). Tested on version 2.9.8. 43 | * pyparsing (library to parse strings). Tested on version 2.4.7. 44 | * six (provides functions to write compatible code across Python 2 and 3). Tested on version 1.15.0. 45 | 46 | Platforms 47 | ========= 48 | 49 | py_entitymatching has been tested on Linux, OS X, and Windows. 50 | -------------------------------------------------------------------------------- /asv.conf.json: -------------------------------------------------------------------------------- 1 | { 2 | // The version of the config file format. Do not change, unless 3 | // you know what you are doing. 4 | "version": 1, 5 | 6 | // The name of the project being benchmarked 7 | "project": "magellan", 8 | 9 | // The project's homepage 10 | //"project_url": "https://github.com/anhaidgroup/magellan/", 11 | "project_url": "https://github.com/anhaidgroup/magellan/", 12 | 13 | // The URL or local path of the source code repository for the 14 | // project being benchmarked 15 | //"repo": "https://github.com/anhaidgroup/magellan.git", 16 | "repo": ".", 17 | 18 | // List of branches to benchmark. If not provided, defaults to "master" 19 | // (for git) or "tip" (for mercurial). 20 | "branches": ["master"], // for git 21 | // "branches": ["tip"], // for mercurial 22 | 23 | // The DVCS being used. If not set, it will be automatically 24 | // determined from "repo" by looking at the protocol in the URL 25 | // (if remote), or by looking for special directories, such as 26 | // ".git" (if local). 27 | // "dvcs": "git", 28 | 29 | // The tool to use to create environments. May be "conda", 30 | // "virtualenv" or other value depending on the plugins in use. 31 | // If missing or the empty string, the tool will be automatically 32 | // determined by looking for tools on the PATH environment 33 | // variable. 34 | "environment_type": "conda", 35 | 36 | // the base URL to show a commit for the project. 37 | "show_commit_url": "https://github.com/anhaidgroup/magellan/commit/", 38 | 39 | // The Pythons you'd like to test against. If not provided, defaults 40 | // to the current version of Python used to run `asv`. 
41 | "pythons": ["2.7"], 42 | //"pythons": ["2.7","3.3", "3.4", "3.5"], 43 | 44 | // The matrix of dependencies to test. Each key is the name of a 45 | // package (in PyPI) and the values are version numbers. An empty 46 | // list indicates to just test against the default (latest) 47 | // version. 48 | "matrix": { 49 | "numpy":[], 50 | "pyqt":[], 51 | "scipy":[], 52 | "pandas":[], 53 | "pyparsing":[], 54 | "six":[], 55 | "scikit-learn":[], 56 | "cloudpickle":[], 57 | "joblib": [], 58 | "pip+py_stringmatching": [], 59 | "pip+pyprind": [] 60 | }, 61 | 62 | // The directory (relative to the current directory) that benchmarks are 63 | // stored in. If not provided, defaults to "benchmarks" 64 | "benchmark_dir": "benchmarks" 65 | 66 | // The directory (relative to the current directory) to cache the Python 67 | // environments in. If not provided, defaults to "env" 68 | // "env_dir": "env", 69 | 70 | 71 | // The directory (relative to the current directory) that raw benchmark 72 | // results are stored in. If not provided, defaults to "results". 73 | // "results_dir": "results", 74 | 75 | // The directory (relative to the current directory) that the html tree 76 | // should be written to. If not provided, defaults to "html". 77 | // "html_dir": "html", 78 | 79 | // The number of characters to retain in the commit hashes. 80 | // "hash_length": 8, 81 | 82 | // `asv` will cache wheels of the recent builds in each 83 | // environment, making them faster to install next time. This is 84 | // number of builds to keep, per environment. 85 | // "wheel_cache_size": 0 86 | } 87 | -------------------------------------------------------------------------------- /benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /build.bat: -------------------------------------------------------------------------------- 1 | "%PYTHON%" setup.py install --single-version-externally-managed --record=record.txt 2 | if errorlevel 1 exit 1 3 | 4 | :: Add more build steps here, if they are necessary. 5 | 6 | :: See 7 | :: http://docs.continuum.io/conda/build.html 8 | :: for a list of environment variables that are set during the build process. -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | $PYTHON setup.py install --single-version-externally-managed --record=record.txt 4 | 5 | # Add more build steps here, if they are necessary. 6 | 7 | # See 8 | # http://docs.continuum.io/conda/build.html 9 | # for a list of environment variables that are set during the build process. 
10 | -------------------------------------------------------------------------------- /build_tools/appveyor/install.ps1: -------------------------------------------------------------------------------- 1 | # Sample script to install Miniconda under Windows 2 | # Authors: Olivier Grisel, Jonathan Helmus and Kyle Kastner, Robert McGibbon 3 | # License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ 4 | 5 | $MINICONDA_URL = "http://repo.continuum.io/miniconda/" 6 | 7 | 8 | function DownloadMiniconda ($python_version, $platform_suffix) { 9 | $webclient = New-Object System.Net.WebClient 10 | $filename = "Miniconda3-latest-Windows-" + $platform_suffix + ".exe" 11 | # $filename = "Miniconda3-3.8.3-Windows-" + $platform_suffix + ".exe" 12 | $url = $MINICONDA_URL + $filename 13 | 14 | $basedir = $pwd.Path + "\" 15 | $filepath = $basedir + $filename 16 | if (Test-Path $filename) { 17 | Write-Host "Reusing" $filepath 18 | return $filepath 19 | } 20 | 21 | # Download and retry up to 3 times in case of network transient errors. 22 | Write-Host "Downloading" $filename "from" $url 23 | $retry_attempts = 2 24 | for($i=0; $i -lt $retry_attempts; $i++){ 25 | try { 26 | $webclient.DownloadFile($url, $filepath) 27 | break 28 | } 29 | Catch [Exception]{ 30 | Start-Sleep 1 31 | } 32 | } 33 | if (Test-Path $filepath) { 34 | Write-Host "File saved at" $filepath 35 | } else { 36 | # Retry once to get the error message if any at the last try 37 | $webclient.DownloadFile($url, $filepath) 38 | } 39 | return $filepath 40 | } 41 | 42 | 43 | function InstallMiniconda ($python_version, $architecture, $python_home) { 44 | Write-Host "Installing Python" $python_version "for" $architecture "bit architecture to" $python_home 45 | if (Test-Path $python_home) { 46 | Write-Host $python_home "already exists, skipping." 47 | return $false 48 | } 49 | if ($architecture -match "32") { 50 | $platform_suffix = "x86" 51 | } else { 52 | $platform_suffix = "x86_64" 53 | } 54 | 55 | $filepath = DownloadMiniconda $python_version $platform_suffix 56 | Write-Host "Installing" $filepath "to" $python_home 57 | $install_log = $python_home + ".log" 58 | $args = "/S /D=$python_home" 59 | Write-Host $filepath $args 60 | Start-Process -FilePath $filepath -ArgumentList $args -Wait -Passthru 61 | if (Test-Path $python_home) { 62 | Write-Host "Python $python_version ($architecture) installation complete" 63 | } else { 64 | Write-Host "Failed to install Python in $python_home" 65 | Get-Content -Path $install_log 66 | Exit 1 67 | } 68 | } 69 | 70 | 71 | function InstallCondaPackages ($python_home, $spec) { 72 | $conda_path = $python_home + "\Scripts\conda.exe" 73 | $args = "install --yes " + $spec 74 | Write-Host ("conda " + $args) 75 | Start-Process -FilePath "$conda_path" -ArgumentList $args -Wait -Passthru 76 | } 77 | 78 | function UpdateConda ($python_home) { 79 | $conda_path = $python_home + "\Scripts\conda.exe" 80 | Write-Host "Updating conda..." 
81 | $args = "update --yes conda" 82 | Write-Host $conda_path $args 83 | Start-Process -FilePath "$conda_path" -ArgumentList $args -Wait -Passthru 84 | } 85 | 86 | 87 | function main () { 88 | InstallMiniconda $env:PYTHON_VERSION $env:PYTHON_ARCH $env:PYTHON 89 | UpdateConda $env:PYTHON 90 | InstallCondaPackages $env:PYTHON "conda-build jinja2 anaconda-client" 91 | } 92 | 93 | main 94 | -------------------------------------------------------------------------------- /build_tools/appveyor/rm_rf.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import sys 4 | import stat 5 | import shutil 6 | 7 | def remove_readonly(func, path, excinfo): 8 | os.chmod(path, stat.S_IWRITE) 9 | func(path) 10 | 11 | def main(): 12 | print(sys.executable) 13 | try: 14 | shutil.rmtree(sys.argv[1], onerror=remove_readonly) 15 | except Exception as e: 16 | print("Error") 17 | print(e) 18 | 19 | if __name__ == '__main__': 20 | main() 21 | 22 | -------------------------------------------------------------------------------- /build_tools/move-conda-package.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import yaml 4 | import glob 5 | import shutil 6 | 7 | #try 8 | # from conda_build.config import config 9 | #except ImportError: 10 | from conda_build.config import Config # 03/03/2017: Updated based on the changes to conda_build.config 11 | config = Config() 12 | 13 | with open(os.path.join(sys.argv[1], 'meta.yaml')) as f: 14 | name = yaml.safe_load(f)['package']['name'] # safe_load: bare yaml.load requires an explicit Loader in modern PyYAML 15 | 16 | binary_package_glob = os.path.join(config.bldpkgs_dir, '{0}*.tar.bz2'.format(name)) 17 | binary_package = glob.glob(binary_package_glob)[0] 18 | 19 | shutil.move(binary_package, '.') 20 | -------------------------------------------------------------------------------- /build_tools/requirements_dev.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.7.0 2 | six 3 | Cython 4 | nose 5 | -------------------------------------------------------------------------------- /conda.recipe/build.bat: -------------------------------------------------------------------------------- 1 | "%PYTHON%" setup.py install --single-version-externally-managed --record=record.txt 2 | if errorlevel 1 exit 1 3 | 4 | :: Add more build steps here, if they are necessary. 5 | 6 | :: See 7 | :: http://docs.continuum.io/conda/build.html 8 | :: for a list of environment variables that are set during the build process. 9 | -------------------------------------------------------------------------------- /conda.recipe/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | $PYTHON setup.py install --single-version-externally-managed --record=record.txt 4 | 5 | # Add more build steps here, if they are necessary. 6 | 7 | # See 8 | # http://docs.continuum.io/conda/build.html 9 | # for a list of environment variables that are set during the build process. 
10 | -------------------------------------------------------------------------------- /conda.recipe/meta.yaml: -------------------------------------------------------------------------------- 1 | package: 2 | name: py_entitymatching 3 | version: "0.2.1" 4 | 5 | source: 6 | git_url: ../ 7 | 8 | 9 | requirements: 10 | build: 11 | - python 12 | - setuptools 13 | - py_stringsimjoin 14 | - cloudpickle 15 | - pyparsing 16 | - scikit-learn 17 | - pyqt 18 | - pandas-profiling 19 | - requests 20 | # - xgboost 21 | 22 | run: 23 | - python 24 | - py_stringsimjoin 25 | - cloudpickle 26 | - pyparsing 27 | - scikit-learn 28 | - pyqt 29 | - pandas-profiling 30 | - requests 31 | # - xgboost 32 | 33 | test: 34 | # Python imports 35 | imports: 36 | - py_entitymatching 37 | 38 | # commands: 39 | # You can put test commands to be run here. Use this to test that the 40 | # entry points work. 41 | 42 | 43 | # You can also put a file called run_test.py in the recipe that will be run 44 | # at test time. 45 | 46 | # requires: 47 | # Put any additional test requirements here. For example 48 | # - nose 49 | 50 | about: 51 | home: https://sites.google.com/site/anhaidgroup/projects/magellan/py_entitymatching 52 | license: BSD License 53 | summary: 'Python library for entity matching.' 54 | 55 | # See 56 | # http://docs.continuum.io/conda/build.html for 57 | # more information about meta.yaml 58 | -------------------------------------------------------------------------------- /continuous-integration/appveyor/install.ps1: -------------------------------------------------------------------------------- 1 | # Sample script to install Miniconda under Windows 2 | # Authors: Olivier Grisel, Jonathan Helmus and Kyle Kastner, Robert McGibbon 3 | # License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ 4 | 5 | $MINICONDA_URL = "http://repo.continuum.io/miniconda/" 6 | 7 | 8 | function DownloadMiniconda ($python_version, $platform_suffix) { 9 | $webclient = New-Object System.Net.WebClient 10 | if ($python_version -match "3.4") { 11 | $filename = "Miniconda3-latest-Windows-" + $platform_suffix + ".exe" 12 | } else { 13 | $filename = "Miniconda3-latest-Windows-" + $platform_suffix + ".exe" 14 | } 15 | $url = $MINICONDA_URL + $filename 16 | 17 | $basedir = $pwd.Path + "\" 18 | $filepath = $basedir + $filename 19 | if (Test-Path $filename) { 20 | Write-Host "Reusing" $filepath 21 | return $filepath 22 | } 23 | 24 | # Download and retry up to 3 times in case of network transient errors. 25 | Write-Host "Downloading" $filename "from" $url 26 | $retry_attempts = 2 27 | for($i=0; $i -lt $retry_attempts; $i++){ 28 | try { 29 | $webclient.DownloadFile($url, $filepath) 30 | break 31 | } 32 | Catch [Exception]{ 33 | Start-Sleep 1 34 | } 35 | } 36 | if (Test-Path $filepath) { 37 | Write-Host "File saved at" $filepath 38 | } else { 39 | # Retry once to get the error message if any at the last try 40 | $webclient.DownloadFile($url, $filepath) 41 | } 42 | return $filepath 43 | } 44 | 45 | 46 | function InstallMiniconda ($python_version, $architecture, $python_home) { 47 | Write-Host "Installing Python" $python_version "for" $architecture "bit architecture to" $python_home 48 | if (Test-Path $python_home) { 49 | Write-Host $python_home "already exists, skipping." 
50 | return $false 51 | } 52 | if ($architecture -match "32") { 53 | $platform_suffix = "x86" 54 | } else { 55 | $platform_suffix = "x86_64" 56 | } 57 | 58 | $filepath = DownloadMiniconda $python_version $platform_suffix 59 | Write-Host "Installing" $filepath "to" $python_home 60 | $install_log = $python_home + ".log" 61 | $args = "/S /D=$python_home" 62 | Write-Host $filepath $args 63 | Start-Process -FilePath $filepath -ArgumentList $args -Wait -Passthru 64 | if (Test-Path $python_home) { 65 | Write-Host "Python $python_version ($architecture) installation complete" 66 | } else { 67 | Write-Host "Failed to install Python in $python_home" 68 | Get-Content -Path $install_log 69 | Exit 1 70 | } 71 | } 72 | 73 | 74 | function InstallCondaPackages ($python_home, $spec) { 75 | $conda_path = $python_home + "\Scripts\conda.exe" 76 | $args = "install --yes " + $spec 77 | Write-Host ("conda " + $args) 78 | Start-Process -FilePath "$conda_path" -ArgumentList $args -Wait -Passthru 79 | } 80 | 81 | function UpdateConda ($python_home) { 82 | $conda_path = $python_home + "\Scripts\conda.exe" 83 | Write-Host "Updating conda..." 84 | $args = "update --yes conda" 85 | Write-Host $conda_path $args 86 | Start-Process -FilePath "$conda_path" -ArgumentList $args -Wait -Passthru 87 | } 88 | 89 | 90 | function main () { 91 | InstallMiniconda $env:PYTHON_VERSION $env:PYTHON_ARCH $env:PYTHON 92 | UpdateConda $env:PYTHON 93 | InstallCondaPackages $env:PYTHON "conda-build jinja2 anaconda-client" 94 | } 95 | 96 | main 97 | 98 | -------------------------------------------------------------------------------- /continuous-integration/appveyor/rm_rf.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import sys 4 | import stat 5 | import shutil 6 | 7 | def remove_readonly(func, path, excinfo): 8 | os.chmod(path, stat.S_IWRITE) 9 | func(path) 10 | 11 | def main(): 12 | print(sys.executable) 13 | try: 14 | shutil.rmtree(sys.argv[1], onerror=remove_readonly) 15 | except Exception as e: 16 | print("Error") 17 | print(e) 18 | 19 | if __name__ == '__main__': 20 | main() 21 | 22 | -------------------------------------------------------------------------------- /continuous-integration/appveyor/run_with_env.cmd: -------------------------------------------------------------------------------- 1 | :: To build extensions for 64 bit Python 3, we need to configure environment 2 | :: variables to use the MSVC 2010 C++ compilers from GRMSDKX_EN_DVD.iso of: 3 | :: MS Windows SDK for Windows 7 and .NET Framework 4 (SDK v7.1) 4 | :: 5 | :: To build extensions for 64 bit Python 2, we need to configure environment 6 | :: variables to use the MSVC 2008 C++ compilers from GRMSDKX_EN_DVD.iso of: 7 | :: MS Windows SDK for Windows 7 and .NET Framework 3.5 (SDK v7.0) 8 | :: 9 | :: 32 bit builds do not require specific environment configurations. 
10 | :: 11 | :: Note: this script needs to be run with the /E:ON and /V:ON flags for the 12 | :: cmd interpreter, at least for (SDK v7.0) 13 | :: 14 | :: More details at: 15 | :: https://github.com/cython/cython/wiki/64BitCythonExtensionsOnWindows 16 | :: http://stackoverflow.com/a/13751649/163740 17 | :: 18 | :: Author: Olivier Grisel 19 | :: License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ 20 | @ECHO OFF 21 | 22 | SET COMMAND_TO_RUN=%* 23 | SET WIN_SDK_ROOT=C:\Program Files\Microsoft SDKs\Windows 24 | 25 | SET MAJOR_PYTHON_VERSION="%PYTHON_VERSION:~0,1%" 26 | IF %MAJOR_PYTHON_VERSION% == "2" ( 27 | SET WINDOWS_SDK_VERSION="v7.0" 28 | ) ELSE IF %MAJOR_PYTHON_VERSION% == "3" ( 29 | SET WINDOWS_SDK_VERSION="v7.1" 30 | ) ELSE ( 31 | ECHO Unsupported Python version: "%MAJOR_PYTHON_VERSION%" 32 | EXIT 1 33 | ) 34 | 35 | IF "%PYTHON_ARCH%"=="64" ( 36 | ECHO Configuring Windows SDK %WINDOWS_SDK_VERSION% for Python %MAJOR_PYTHON_VERSION% on a 64 bit architecture 37 | SET DISTUTILS_USE_SDK=1 38 | SET MSSdk=1 39 | "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Setup\WindowsSdkVer.exe" -q -version:%WINDOWS_SDK_VERSION% 40 | "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Bin\SetEnv.cmd" /x64 /release 41 | ECHO Executing: %COMMAND_TO_RUN% 42 | call %COMMAND_TO_RUN% || EXIT 1 43 | ) ELSE ( 44 | ECHO Using default MSVC build environment for 32 bit architecture 45 | ECHO Executing: %COMMAND_TO_RUN% 46 | call %COMMAND_TO_RUN% || EXIT 1 47 | ) 48 | 49 | -------------------------------------------------------------------------------- /cythonize.dat: -------------------------------------------------------------------------------- 1 | py_entitymatching/debugblocker/debugblocker_cython.pyx NA 5e568768d488850114e2748b2190a9f647c97d66 eeb6f78e85562d50de57173b94a39b7db5ecae0b 2 | -------------------------------------------------------------------------------- /docs/how_to_guide.rst: -------------------------------------------------------------------------------- 1 | ================================== 2 | How to Guide To Do Entity Matching 3 | ================================== 4 | 5 | The initial draft of the how to guide to do entity matching can be found `here. `_ 6 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | 2 | User Manual for py_entitymatching 3 | ================================= 4 | 5 | This document explains how to install, use, and contribute to the package. 6 | 7 | Contents 8 | ======== 9 | 10 | .. toctree:: 11 | :maxdepth: 3 12 | :caption: How To Guide 13 | 14 | how_to_guide 15 | 16 | .. toctree:: 17 | :maxdepth: 3 18 | :caption: User Manual 19 | 20 | user_manual/whatisnew 21 | user_manual/installation 22 | user_manual/overview 23 | user_manual/guides 24 | user_manual/datastructures 25 | user_manual/steps_supp_em_workflows 26 | user_manual/misc 27 | 28 | .. toctree:: 29 | :maxdepth: 3 30 | :caption: API Reference 31 | 32 | user_manual/api/overview 33 | user_manual/api/actual_commands 34 | 35 | .. 
toctree:: 36 | :maxdepth: 3 37 | :caption: How to Contribute 38 | 39 | contributing 40 | 41 | Indices and tables 42 | ================== 43 | 44 | * :ref:`genindex` 45 | * :ref:`modindex` 46 | * :ref:`search` 47 | -------------------------------------------------------------------------------- /docs/make_copy.sh: -------------------------------------------------------------------------------- 1 | cd /Users/pradap/Documents/Research/Python-Package/anhaid/py_entitymatching/docs 2 | make clean html 3 | cd _build/html 4 | scp -r * pradap@trinity.cs.wisc.edu:~/public/html-www/magellan/user_manual/multi_page 5 | cd /Users/pradap/Documents/Research/Python-Package/anhaid/py_entitymatching/docs 6 | make clean singlehtml 7 | cd _build/singlehtml 8 | scp -r * pradap@trinity.cs.wisc.edu:~/public/html-www/magellan/user_manual/single_page 9 | cd /Users/pradap/Documents/Research/Python-Package/anhaid/py_entitymatching/docs 10 | -------------------------------------------------------------------------------- /docs/themes/nature_with_gtoc/layout.html: -------------------------------------------------------------------------------- 1 | {# 2 | 3 | Subset of agogo theme 4 | agogo/layout.html 5 | 6 | Sphinx layout template for the agogo theme, originally written 7 | by Andi Albrecht. 8 | 9 | :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS. 10 | :license: BSD, see LICENSE for details. 11 | #} 12 | {% extends "basic/layout.html" %} 13 | 14 | {%- block content %} 15 |
16 | 17 | 18 | 19 | {%- block sidebar1 %} 20 | {%- block sidebartoc %} 21 | {{ _('Table Of Contents') }} 22 | {{ toctree() }} 23 | {%- endblock %} 24 | {%- block sidebarsearch %} 25 | {{ _('Search') }} 26 | 27 | 33 | 34 | {{ _('Enter search terms or a module, class or function name.') }} 35 | 36 | 37 | 38 | {%- endblock %} 39 | {# possible location for sidebar #} {% endblock %} 40 | 41 | 42 | {%- block document %} 43 | 44 | {%- if render_sidebar %} 45 | 46 | {%- endif %} 47 | 48 | {% block body %} {% endblock %} 49 | 50 | {%- if render_sidebar %} 51 | 52 | {%- endif %} 53 | 54 | {%- endblock %} 55 | 56 | {%- block sidebar2 %} 57 | 58 | {% endblock %} 59 | 60 | 61 | 62 |
63 | {%- endblock %} 64 | 65 | {%- block footer %} 66 | 76 | Scroll To Top 77 | 97 | 108 | {% endblock %} -------------------------------------------------------------------------------- /docs/themes/nature_with_gtoc/theme.conf: -------------------------------------------------------------------------------- 1 | [theme] 2 | inherit = basic 3 | stylesheet = nature.css 4 | pygments_style = tango 5 | 6 | [options] 7 | sidebarwidth = 270 8 | -------------------------------------------------------------------------------- /docs/user_manual/api/actual_commands.rst: -------------------------------------------------------------------------------- 1 | ============================= 2 | Commands in py_entitymatching 3 | ============================= 4 | 5 | .. toctree:: 6 | :maxdepth: 3 7 | 8 | reading_and_writing_data 9 | loading_and_saving_objects 10 | handling_metadata 11 | downsampling 12 | data_exploration 13 | blocking 14 | debugging_blocker_output 15 | combining_blocker_outputs 16 | sampling 17 | labeling 18 | handling_features 19 | matching 20 | debugging_matcher 21 | triggers 22 | evaluating_the_matching_output 23 | 24 | ===================== 25 | Experimental Commands 26 | ===================== 27 | .. toctree:: 28 | :maxdepth: 3 29 | 30 | dask_commands 31 | tuners 32 | combiner 33 | 34 | 35 | -------------------------------------------------------------------------------- /docs/user_manual/api/adding_features.rst: -------------------------------------------------------------------------------- 1 | ================================ 2 | Adding Features to Feature Table 3 | ================================ 4 | .. autofunction:: py_entitymatching.get_feature_fn 5 | .. autofunction:: py_entitymatching.add_feature 6 | .. autofunction:: py_entitymatching.add_blackbox_feature 7 | -------------------------------------------------------------------------------- /docs/user_manual/api/blocking.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Blocking 3 | ======== 4 | .. autoclass:: py_entitymatching.AttrEquivalenceBlocker 5 | :members: 6 | .. autoclass:: py_entitymatching.OverlapBlocker 7 | :members: 8 | .. autoclass:: py_entitymatching.RuleBasedBlocker 9 | :members: 10 | .. autoclass:: py_entitymatching.BlackBoxBlocker 11 | :members: 12 | .. autoclass:: py_entitymatching.SortedNeighborhoodBlocker 13 | :members: 14 | -------------------------------------------------------------------------------- /docs/user_manual/api/combiner.rst: -------------------------------------------------------------------------------- 1 | ================ 2 | Matcher Combiner 3 | ================ 4 | .. autoclass:: py_entitymatching.matchercombiner.matchercombiner.MajorityVote 5 | :members: 6 | .. autoclass:: py_entitymatching.matchercombiner.matchercombiner.WeightedVote 7 | :members: 8 | -------------------------------------------------------------------------------- /docs/user_manual/api/combining_blocker_outputs.rst: -------------------------------------------------------------------------------- 1 | ========================= 2 | Combining Blocker Outputs 3 | ========================= 4 | .. 
autofunction:: py_entitymatching.combine_blocker_outputs_via_union -------------------------------------------------------------------------------- /docs/user_manual/api/creating_the_features_automatically.rst: -------------------------------------------------------------------------------- 1 | =================================== 2 | Creating the Features Automatically 3 | =================================== 4 | 5 | .. autofunction:: py_entitymatching.get_features_for_blocking 6 | .. autofunction:: py_entitymatching.get_features_for_matching 7 | 8 | -------------------------------------------------------------------------------- /docs/user_manual/api/creating_the_features_manually.rst: -------------------------------------------------------------------------------- 1 | ============================== 2 | Creating the Features Manually 3 | ============================== 4 | 5 | .. autofunction:: py_entitymatching.get_features 6 | .. autofunction:: py_entitymatching.get_attr_corres 7 | .. autofunction:: py_entitymatching.get_attr_types 8 | .. autofunction:: py_entitymatching.get_sim_funs_for_blocking 9 | .. autofunction:: py_entitymatching.get_sim_funs_for_matching 10 | .. autofunction:: py_entitymatching.get_tokenizers_for_blocking 11 | .. autofunction:: py_entitymatching.get_tokenizers_for_matching 12 | -------------------------------------------------------------------------------- /docs/user_manual/api/dask_commands.rst: -------------------------------------------------------------------------------- 1 | =============================== 2 | Commands Implemented Using Dask 3 | =============================== 4 | 5 | Downsampling 6 | ------------ 7 | .. autofunction:: py_entitymatching.dask.dask_down_sample.dask_down_sample 8 | 9 | 10 | Blocking 11 | -------- 12 | .. autoclass:: py_entitymatching.dask.dask_attr_equiv_blocker.DaskAttrEquivalenceBlocker 13 | :members: 14 | 15 | .. autoclass:: py_entitymatching.dask.dask_overlap_blocker.DaskOverlapBlocker 16 | :members: 17 | 18 | .. autoclass:: py_entitymatching.dask.dask_rule_based_blocker.DaskRuleBasedBlocker 19 | :members: 20 | 21 | .. autoclass:: py_entitymatching.dask.dask_black_box_blocker.DaskBlackBoxBlocker 22 | :members: 23 | 24 | Extracting Feature Vectors 25 | -------------------------- 26 | .. autofunction:: py_entitymatching.dask.dask_extract_features.dask_extract_feature_vecs 27 | 28 | ML-Matchers 29 | ----------- 30 | .. autoclass:: py_entitymatching.dask.dask_dtmatcher.DaskDTMatcher 31 | :inherited-members: 32 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ 33 | 34 | .. autoclass:: py_entitymatching.dask.dask_rfmatcher.DaskRFMatcher 35 | :inherited-members: 36 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ 37 | 38 | .. autoclass:: py_entitymatching.dask.dask_svm_matcher.DaskSVMMatcher 39 | :inherited-members: 40 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ 41 | 42 | .. autoclass:: py_entitymatching.dask.dask_nbmatcher.DaskNBMatcher 43 | :inherited-members: 44 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ 45 | 46 | ..
autoclass:: py_entitymatching.dask.dask_logregmatcher.DaskLogRegMatcher 47 | :inherited-members: 48 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ 49 | 50 | 51 | .. autoclass:: py_entitymatching.dask.dask_xgboost_matcher.DaskXGBoostMatcher 52 | :inherited-members: 53 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ -------------------------------------------------------------------------------- /docs/user_manual/api/data_exploration.rst: -------------------------------------------------------------------------------- 1 | ================ 2 | Data Exploration 3 | ================ 4 | .. autofunction:: py_entitymatching.data_explore_openrefine 5 | .. autofunction:: py_entitymatching.data_explore_pandastable 6 | -------------------------------------------------------------------------------- /docs/user_manual/api/debugging_blocker_output.rst: -------------------------------------------------------------------------------- 1 | ======================== 2 | Debugging Blocker Output 3 | ======================== 4 | .. autofunction:: py_entitymatching.debug_blocker 5 | .. autofunction:: py_entitymatching.backup_debug_blocker 6 | -------------------------------------------------------------------------------- /docs/user_manual/api/debugging_matcher.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | Debugging Matcher 3 | ================= 4 | .. autofunction:: py_entitymatching.vis_debug_dt 5 | .. autofunction:: py_entitymatching.vis_debug_rf 6 | .. autofunction:: py_entitymatching.debug_decisiontree_matcher 7 | .. autofunction:: py_entitymatching.debug_randomforest_matcher 8 | -------------------------------------------------------------------------------- /docs/user_manual/api/downsampling.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Downsampling 3 | ============ 4 | .. autofunction:: py_entitymatching.down_sample -------------------------------------------------------------------------------- /docs/user_manual/api/evaluating_the_matching_output.rst: -------------------------------------------------------------------------------- 1 | =============================== 2 | Evaluating the Matching Output 3 | =============================== 4 | 5 | .. autofunction:: py_entitymatching.eval_matches 6 | .. autofunction:: py_entitymatching.print_eval_summary 7 | .. autofunction:: py_entitymatching.get_false_positives_as_df 8 | .. autofunction:: py_entitymatching.get_false_negatives_as_df 9 | -------------------------------------------------------------------------------- /docs/user_manual/api/extracting_feature_vectors.rst: -------------------------------------------------------------------------------- 1 | ========================== 2 | Extracting Feature Vectors 3 | ========================== 4 | .. autofunction:: py_entitymatching.extract_feature_vecs -------------------------------------------------------------------------------- /docs/user_manual/api/handling_features.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | Handling Features 3 | ================= 4 | 5 | ..
toctree:: 6 | :maxdepth: 2 7 | 8 | creating_the_features_automatically 9 | creating_the_features_manually 10 | adding_features 11 | extracting_feature_vectors 12 | imputing_missing_values 13 | supported_similarity_functions 14 | supported_tokenizers -------------------------------------------------------------------------------- /docs/user_manual/api/handling_metadata.rst: -------------------------------------------------------------------------------- 1 | .. _label-handling-metadata: 2 | 3 | ================= 4 | Handling Metadata 5 | ================= 6 | .. autofunction:: py_entitymatching.get_catalog 7 | .. autofunction:: py_entitymatching.get_catalog_len 8 | .. autofunction:: py_entitymatching.del_catalog 9 | .. autofunction:: py_entitymatching.is_catalog_empty 10 | .. autofunction:: py_entitymatching.is_dfinfo_present 11 | .. autofunction:: py_entitymatching.is_property_present_for_df 12 | .. autofunction:: py_entitymatching.show_properties 13 | .. autofunction:: py_entitymatching.show_properties_for_id 14 | .. autofunction:: py_entitymatching.get_property 15 | .. autofunction:: py_entitymatching.set_property 16 | .. autofunction:: py_entitymatching.del_property 17 | .. autofunction:: py_entitymatching.copy_properties 18 | .. autofunction:: py_entitymatching.get_key 19 | .. autofunction:: py_entitymatching.set_key 20 | .. autofunction:: py_entitymatching.get_fk_ltable 21 | .. autofunction:: py_entitymatching.set_fk_ltable 22 | .. autofunction:: py_entitymatching.get_fk_rtable 23 | .. autofunction:: py_entitymatching.set_fk_rtable 24 | .. autofunction:: py_entitymatching.get_ltable 25 | .. autofunction:: py_entitymatching.set_ltable 26 | .. autofunction:: py_entitymatching.get_rtable 27 | .. autofunction:: py_entitymatching.set_rtable 28 | 29 | -------------------------------------------------------------------------------- /docs/user_manual/api/imputing_missing_values.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | Imputing Missing Values 3 | ======================= 4 | .. autofunction:: py_entitymatching.impute_table -------------------------------------------------------------------------------- /docs/user_manual/api/labeling.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Labeling 3 | ======== 4 | .. autofunction:: py_entitymatching.label_table 5 | -------------------------------------------------------------------------------- /docs/user_manual/api/loading_and_saving_objects.rst: -------------------------------------------------------------------------------- 1 | ============================ 2 | Loading and Saving Objects 3 | ============================ 4 | .. autofunction:: py_entitymatching.load_table 5 | .. autofunction:: py_entitymatching.save_table 6 | .. autofunction:: py_entitymatching.load_object 7 | .. autofunction:: py_entitymatching.save_object 8 | -------------------------------------------------------------------------------- /docs/user_manual/api/matching.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Matching 3 | ======== 4 | .. 
toctree:: 5 | 6 | splitting_data_into_train_test 7 | supported_matchers 8 | selecting_matcher -------------------------------------------------------------------------------- /docs/user_manual/api/overview.rst: -------------------------------------------------------------------------------- 1 | ================================ 2 | Overview of Command Organization 3 | ================================ 4 | 5 | The commands are organized into two parts: first, the commands that the user will typically use to 6 | create an entity matching workflow; second, a set of experimental commands that may also prove 7 | useful when creating an entity matching workflow. Specifically, the experimental part includes commands such 8 | as Dask-based implementations of blockers and commands for combining predictions from a set of matchers. 9 | However, the experimental commands are not well tested, so use these commands at your own risk. 10 | 11 | -------------------------------------------------------------------------------- /docs/user_manual/api/reading_and_writing_data.rst: -------------------------------------------------------------------------------- 1 | ========================= 2 | Reading and Writing Data 3 | ========================= 4 | .. autofunction:: py_entitymatching.read_csv_metadata 5 | .. autofunction:: py_entitymatching.to_csv_metadata 6 | -------------------------------------------------------------------------------- /docs/user_manual/api/sampling.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Sampling 3 | ======== 4 | .. autofunction:: py_entitymatching.sample_table -------------------------------------------------------------------------------- /docs/user_manual/api/selecting_matcher.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | Selecting Matcher 3 | ================== 4 | .. autofunction:: py_entitymatching.select_matcher -------------------------------------------------------------------------------- /docs/user_manual/api/splitting_data_into_train_test.rst: -------------------------------------------------------------------------------- 1 | ================================== 2 | Splitting Data into Train and Test 3 | ================================== 4 | .. autofunction:: py_entitymatching.split_train_test -------------------------------------------------------------------------------- /docs/user_manual/api/supported_matchers.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | Supported Matchers 3 | ================== 4 | 5 | ML Matchers 6 | =========== 7 | 8 | .. autoclass:: py_entitymatching.DTMatcher 9 | :inherited-members: 10 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ 11 | 12 | .. autoclass:: py_entitymatching.RFMatcher 13 | :inherited-members: 14 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ 15 | 16 | .. autoclass:: py_entitymatching.SVMMatcher 17 | :inherited-members: 18 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ 19 | 20 | ..
autoclass:: py_entitymatching.NBMatcher 21 | :inherited-members: 22 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ 23 | 24 | .. autoclass:: py_entitymatching.LinRegMatcher 25 | :inherited-members: 26 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ 27 | 28 | 29 | .. autoclass:: py_entitymatching.LogRegMatcher 30 | :inherited-members: 31 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ 32 | 33 | 34 | .. autoclass:: py_entitymatching.XGBoostMatcher 35 | :inherited-members: 36 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ 37 | 38 | 39 | Rule-Based Matcher 40 | ================== 41 | 42 | .. autoclass:: py_entitymatching.BooleanRuleMatcher 43 | :inherited-members: 44 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ 45 | -------------------------------------------------------------------------------- /docs/user_manual/api/supported_similarity_functions.rst: -------------------------------------------------------------------------------- 1 | ============================== 2 | Supported Similarity Functions 3 | ============================== 4 | 5 | .. autofunction:: py_entitymatching.affine 6 | .. autofunction:: py_entitymatching.hamming_dist 7 | .. autofunction:: py_entitymatching.hamming_sim 8 | .. autofunction:: py_entitymatching.lev_dist 9 | .. autofunction:: py_entitymatching.lev_sim 10 | .. autofunction:: py_entitymatching.jaro 11 | .. autofunction:: py_entitymatching.jaro_winkler 12 | .. autofunction:: py_entitymatching.needleman_wunsch 13 | .. autofunction:: py_entitymatching.smith_waterman 14 | .. autofunction:: py_entitymatching.jaccard 15 | .. autofunction:: py_entitymatching.cosine 16 | .. autofunction:: py_entitymatching.overlap_coeff 17 | .. autofunction:: py_entitymatching.dice 18 | .. autofunction:: py_entitymatching.monge_elkan 19 | .. autofunction:: py_entitymatching.exact_match 20 | .. autofunction:: py_entitymatching.rel_diff 21 | .. autofunction:: py_entitymatching.abs_norm -------------------------------------------------------------------------------- /docs/user_manual/api/supported_tokenizers.rst: -------------------------------------------------------------------------------- 1 | ==================== 2 | Supported Tokenizers 3 | ==================== 4 | .. autofunction:: py_entitymatching.tok_qgram 5 | .. autofunction:: py_entitymatching.tok_delim 6 | .. autofunction:: py_entitymatching.tok_wspace 7 | .. autofunction:: py_entitymatching.tok_alphabetic 8 | .. autofunction:: py_entitymatching.tok_alphanumeric -------------------------------------------------------------------------------- /docs/user_manual/api/triggers.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Triggers 3 | ======== 4 | ..
autoclass:: py_entitymatching.MatchTrigger 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/user_manual/api/tuners.rst: -------------------------------------------------------------------------------- 1 | ================================== 2 | Tuners for the Dask-based Commands 3 | ================================== 4 | 5 | Downsampling 6 | ------------ 7 | .. autofunction:: py_entitymatching.tuner.tuner_down_sample.tuner_down_sample 8 | 9 | 10 | Overlap Blocker 11 | --------------- 12 | .. autofunction:: py_entitymatching.tuner.tuner_overlap_blocker.tuner_overlap_blocker 13 | -------------------------------------------------------------------------------- /docs/user_manual/blocker_hierarchy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/docs/user_manual/blocker_hierarchy.png -------------------------------------------------------------------------------- /docs/user_manual/create_feats_for_matching.rst: -------------------------------------------------------------------------------- 1 | .. _label-create-feats-matching: 2 | 3 | ============================== 4 | Creating Features for Matching 5 | ============================== 6 | If you have to use supervised learning-based matchers, then you cannot just operate on the 7 | labeled set of tuple pairs. You need to convert each tuple pair in the labeled set 8 | into a feature vector, which consists of a list of numerical/categorical features. To do 9 | this, first we need to create a set of features. 10 | 11 | There are two ways to create features: 12 | 13 | * Automatically create a set of features (then the user can remove or add some more). 14 | * Skip the automatic process and generate features manually. 15 | 16 | 17 | Creating the Features Manually 18 | ------------------------------ 19 | This is very similar to the manual feature creation process for blocking (see section 20 | :ref:`label-create-features-blocking`) except the features are created for 21 | matching purposes. 22 | In brief, you can execute the following sequence of commands in py_entitymatching 23 | to create the features manually: 24 | 25 | >>> match_t = em.get_tokenizers_for_matching() 26 | >>> match_s = em.get_sim_funs_for_matching() 27 | >>> atypes1 = em.get_attr_types(A) # don't need, if atypes1 exists from blocking step 28 | >>> atypes2 = em.get_attr_types(B) # don't need, if atypes2 exists from blocking step 29 | >>> match_c = em.get_attr_corres(A, B) 30 | >>> match_f = em.get_features(A, B, atypes1, atypes2, match_c, match_t, match_s) 31 | 32 | Further, you can add or delete features as we saw in section 33 | :ref:`label-add-remove-features`. 34 | 35 | Please refer to the API reference of :py:meth:`~py_entitymatching.get_tokenizers_for_matching` 36 | and :py:meth:`~py_entitymatching.get_sim_funs_for_matching` for more details. 37 | 38 | .. note:: Currently, py_entitymatching returns the same set of features for blocking and matching purposes. 39 | 40 | Creating the Features Automatically 41 | ----------------------------------- 42 | If you do not want to go through the hassle of creating the features manually, then 43 | you can generate the features automatically. This is very similar to the automatic 44 | feature creation process for blocking (see section :ref:`label-gen-feats-automatically`).
45 | 46 | In py_entitymatching, you can use `get_features_for_matching` to generate features 47 | for matching purposes automatically. An example of using `get_features_for_matching` is 48 | shown below: 49 | 50 | >>> match_f = em.get_features_for_matching(A, B) 51 | 52 | Similar to what we saw in section :ref:`label-gen-feats-automatically` for blocking, the 53 | command will set the following variables: `_match_t`, `_match_s`, `_atypes1`, `_atypes2`, `_match_c` 54 | and they can be accessed like this: 55 | 56 | >>> em._match_t 57 | >>> em._match_s 58 | >>> em._atypes1 59 | >>> em._atypes2 60 | >>> em._match_c 61 | 62 | You may want to examine these variables, modify them as appropriate, and then 63 | regenerate the set of features. 64 | Please refer to the API reference of :py:meth:`~py_entitymatching.get_features_for_matching` 65 | for more details. 66 | 67 | 68 | 69 | 70 | 71 | -------------------------------------------------------------------------------- /docs/user_manual/data_exploration.rst: -------------------------------------------------------------------------------- 1 | ================ 2 | Data Exploration 3 | ================ 4 | 5 | Data exploration is an important part of the entity matching workflow because it 6 | gives the user a chance to look at the actual data closely. Data exploration 7 | allows the user to inspect the individual records and features present in the 8 | table so that he or she can understand the important trends and relationships 9 | present in the data. A complete understanding of the data gives the user an 10 | advantage later on in the entity matching workflow. 11 | 12 | 13 | OpenRefine 14 | ---------- 15 | 16 | OpenRefine is a data exploration tool that is compatible with Python >= 2.7 or 17 | Python >= 3.4. More information about OpenRefine can be found at its GitHub page 18 | at https://github.com/OpenRefine/OpenRefine 19 | 20 | 21 | .. note:: 22 | OpenRefine is not included with py_entitymatching and must be downloaded and 23 | installed separately. The installation instructions can be found at 24 | https://github.com/OpenRefine/OpenRefine/wiki/Installation-Instructions 25 | 26 | Using OpenRefine 27 | ~~~~~~~~~~~~~~~~ 28 | 29 | Before using OpenRefine, you must launch the application to start an OpenRefine 30 | server. The steps for doing so are explained after the installation 31 | instructions at https://github.com/OpenRefine/OpenRefine/wiki/Installation-Instructions 32 | 33 | Once the application has created a server, copy the URL from the address bar of 34 | the OpenRefine browser (default is http://127.0.0.1:3333 ). Then the data can 35 | be explored as in the example below: 36 | 37 | 38 | >>> import py_entitymatching as em 39 | >>> A = em.read_csv_metadata('path_to_csv_dir/table.csv', key='ID') 40 | >>> p = em.data_explore_openrefine(A, name='Table') 41 | >>> # Save the project back to our dataframe 42 | >>> # Calling export_pandas_frame will automatically delete the OpenRefine project 43 | >>> df = p.export_pandas_frame() 44 | 45 | 46 | Pandastable 47 | ----------- 48 | Pandastable is a data exploration tool available for Python >= 3.4 that allows users 49 | to view and manipulate data. More information about pandastable can be found at 50 | https://github.com/dmnfarrell/pandastable 51 | 52 | .. note:: 53 | pandastable is not packaged along with py_entitymatching.
You can install 54 | pandastable using pip as shown below: 55 | 56 | $ pip install pandastable 57 | 58 | or conda as shown below: 59 | 60 | $ conda install -c dmnfarrell pandastable=0.7.1 61 | 62 | 63 | 64 | Using pandastable 65 | ~~~~~~~~~~~~~~~~~ 66 | 67 | 68 | Pandastable can easily be used with the wrappers included with py_entitymatching. 69 | The following example shows how: 70 | 71 | >>> # import py_entitymatching 72 | >>> import py_entitymatching as em 73 | >>> # Explore the data using pandastable 74 | >>> A = em.read_csv_metadata('path_to_csv_dir/table.csv', key='ID') 75 | >>> em.data_explore_pandastable(A) 76 | -------------------------------------------------------------------------------- /docs/user_manual/debugging_blocking.rst: -------------------------------------------------------------------------------- 1 | ================== 2 | Debugging Blocking 3 | ================== 4 | In a typical entity matching workflow, you will load in the two tables to 5 | match, sample them (if required) and use a blocker to remove obvious non-matches. 6 | But it is often not clear whether the blocker drops only non-matches or whether it 7 | also removes a lot of potential matches. 8 | 9 | In such cases, it is important to debug the output of the blocker. In 10 | py_entitymatching, the `debug_blocker` command can be used for that purpose. 11 | 12 | The `debug_blocker` command takes in two input tables A, B, and blocker output C, 13 | and returns a table D containing a set of tuple pairs that are 14 | potential matches and yet are not present in the blocker output 15 | C. Table D also contains a similarity measure computed for each reported 16 | tuple pair (as its second column). 17 | 18 | You can examine these potential matches in table D. If you 19 | find that many of them are indeed true matches, then that means the 20 | blocker may have removed too many true matches. In this case you 21 | may want to `relax` the blocker by modifying its parameters, or 22 | choose a different blocker. On the other hand, if you do not 23 | find many true matches in table D, then it could be the case that the 24 | blocker has done a good job and preserved all (or most) of 25 | the matches in the blocker output C. 26 | 27 | In `debug_blocker`, you can optionally specify attribute correspondences between 28 | the input tables A and B. If they are not specified, then the attribute correspondences 29 | will be a list of attribute pairs with the exact same names in A and B. 30 | 31 | The debugger will use only the attributes mentioned in these attribute 32 | correspondences to try to find potentially matching pairs and place 33 | those pairs into D. Thus, our recommendation is that (a) if the tables 34 | have identical schemas or share a lot of attributes with the same 35 | names, then do not specify the attribute correspondences; in this 36 | case the debugger will use all the attributes with the same name between the two 37 | schemas, and (b) otherwise, think about what attribute pairs you want to see the 38 | debugger use, then specify those as attribute correspondences.
39 | 40 | An example of using `debug_blocker` is shown below: 41 | 42 | >>> import py_entitymatching as em 43 | >>> ob = em.OverlapBlocker() 44 | >>> C = ob.block_tables(A, B, l_overlap_attr='title', r_overlap_attr='title', overlap_size=3) 45 | >>> corres = [('ID','ssn'), ('name', 'ename'), ('address', 'location'),('zipcode', 'zipcode')] 46 | >>> D = em.debug_blocker(C, A, B, attr_corres=corres) 47 | 48 | Please refer to the API reference of :py:meth:`~py_entitymatching.debug_blocker` 49 | for more details. 50 | 51 | The blocker debugger is implemented in Cython. In case this version of the 52 | command is not working properly, there is also a Python version of the command, 53 | called `backup_debug_blocker`, that can be used instead. Please refer 54 | to the API reference of :py:meth:`~py_entitymatching.backup_debug_blocker` for 55 | more details. 56 | 57 | -------------------------------------------------------------------------------- /docs/user_manual/down_sampling.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | Down Sampling 3 | ============= 4 | Once the tables to be matched are read, they must be down sampled if the number of 5 | tuples in them is large (for example, 100K+ tuples). This is because working with 6 | large tables can be very time consuming (as any operation performed would have 7 | to process these large tables). 8 | 9 | Random sampling, however, does not work, because the sampled tables may end up sharing very 10 | few matches, especially if the number of matches between the 11 | input tables is small to begin with. 12 | 13 | In py_entitymatching, you can sample the input tables using the `down_sample` command. 14 | This command samples the input tables intelligently, ensuring a reasonable number of 15 | matches between them. 16 | 17 | If `A` and `B` are the input tables, then you can use the `down_sample` command as shown 18 | below: 19 | 20 | >>> sample_A, sample_B = em.down_sample(A, B, size=500, y_param=1) 21 | 22 | Conceptually, the command takes in two original input tables, `A`, `B` (and some parameters), 23 | and produces two sampled tables, `sample_A` and `sample_B`. 24 | Specifically, you must set the `size` to be the number of tuples that 25 | should be sampled from `B` (this will be the size of the `sample_B` table) and set the 26 | `y_param` to be the number of tuples to be selected from `A` (for each tuple in 27 | the `sample_B` table). The command internally uses a 28 | heuristic to ensure a reasonable number of matches between `sample_A` and `sample_B`. 29 | 30 | Please look at the API reference of :py:meth:`~py_entitymatching.down_sample` for more 31 | details. 32 | 33 | .. note:: Currently, the input tables must be loaded in memory before the user can down 34 | sample. 35 | 36 | 37 | -------------------------------------------------------------------------------- /docs/user_manual/evaluate_matching.rst: -------------------------------------------------------------------------------- 1 | ============================== 2 | Evaluating the Matching Output 3 | ============================== 4 | Once you have predicted matches using an ML-based matcher, you would have to 5 | evaluate the matches. py_entitymatching supports the `eval_matches` command for that 6 | purpose.
7 | 8 | An example of using the `eval_matches` command is shown below: 9 | 10 | >>> H = em.extract_feature_vecs(G, feature_table=match_f, attrs_after='gold_labels') 11 | >>> dt = em.DTMatcher() 12 | >>> dt.fit(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], target_attr='gold_labels') 13 | >>> pred_table = dt.predict(table=H, exclude_attrs=['_id', 'ltable_id', 'rtable_id', 'gold_labels'], append=True, target_attr='predicted_labels') 14 | >>> eval_summary = em.eval_matches(pred_table, 'gold_labels', 'predicted_labels') 15 | 16 | In the above, `eval_summary` is a dictionary containing accuracy numbers (such as 17 | precision, recall, F1, etc.) and the list of false positives/negatives. 18 | 19 | Please refer to the API reference of :py:meth:`~py_entitymatching.eval_matches` for 20 | more details. 21 | 22 | -------------------------------------------------------------------------------- /docs/user_manual/example-blocking-matching.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/docs/user_manual/example-blocking-matching.png -------------------------------------------------------------------------------- /docs/user_manual/example-dev-stage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/docs/user_manual/example-dev-stage.png -------------------------------------------------------------------------------- /docs/user_manual/example-match-two-tables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/docs/user_manual/example-match-two-tables.png -------------------------------------------------------------------------------- /docs/user_manual/example-prod-stage.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/docs/user_manual/example-prod-stage.png -------------------------------------------------------------------------------- /docs/user_manual/example-workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/docs/user_manual/example-workflow.png -------------------------------------------------------------------------------- /docs/user_manual/extract_feat_vecs.rst: -------------------------------------------------------------------------------- 1 | ========================== 2 | Extracting Feature Vectors 3 | ========================== 4 | Once you have created a set of features, you use them to convert the labeled sample into feature 5 | vectors. In py_entitymatching, you can use `extract_feature_vecs` to convert 6 | the labeled sample into feature vectors using the features created 7 | (see section :ref:`label-create-feats-matching`).
8 | 9 | An example of using `extract_feature_vecs` is shown below: 10 | 11 | >>> H = em.extract_feature_vecs(G, feature_table=match_f, attrs_before=['title'], attrs_after=['gold_labels']) 12 | 13 | Conceptually, the command takes the labeled data (`G`), applies the feature functions (in `match_f`) 14 | to each tuple in G to create a Dataframe, adds the `attrs_before` and `attrs_after` 15 | columns, updates the metadata and returns the resulting Dataframe. 16 | 17 | If there are one or several columns in the labeled data that contain the labels, then those need 18 | to be explicitly specified in `attrs_after` if you want them to be copied over. 19 | 20 | Please refer to the API reference of :py:meth:`~py_entitymatching.extract_feature_vecs` 21 | for more details. 22 | -------------------------------------------------------------------------------- /docs/user_manual/labeling.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Labeling 3 | ======== 4 | The command `label_table` can be used to label the samples (see section 5 | :ref:`label-sampling`). An example of using `label_table` is shown below: 6 | 7 | >>> G = em.label_table(S, label_column_name='gold_labels') 8 | 9 | The above command will first create a copy of the input table `S`, update 10 | the metadata, add a column with the 11 | specified column name (in the `label_column_name` parameter), fill it with 0 (i.e., non-matches), 12 | and open a GUI for you to update the labels. You must specify 0 for non-matches and 13 | 1 for matches. Once you close the GUI, the updated table will be returned. 14 | 15 | Please refer to the API reference of :py:meth:`~py_entitymatching.label_table` 16 | for more details. 17 | -------------------------------------------------------------------------------- /docs/user_manual/matchercombiner.rst: -------------------------------------------------------------------------------- 1 | ============================================ 2 | Combining Predictions from Multiple Matchers 3 | ============================================ 4 | In the matching step, if you use multiple matchers then you will have to combine the 5 | predictions from them to get a consolidated prediction. There are many different ways 6 | to combine these predictions such as weighted vote, majority vote, stacking, etc. 7 | Currently, py_entitymatching supports majority and weighted voting-based combining. 8 | These combiners are experimental and not tested. 9 | 10 | An example of using majority voting-based combining is shown below. 11 | 12 | >>> dt = DTMatcher() 13 | >>> rf = RFMatcher() 14 | >>> nb = NBMatcher() 15 | >>> dt.fit(table=H, exclude_attrs=['_id', 'l_id', 'r_id'], target_attr='label') # H is training set containing feature vectors 16 | >>> dt.predict(table=L, exclude_attrs=['id', 'l_id', 'r_id'], append=True, inplace=True, target_attr='dt_predictions') # L is the test set for which we should get predictions.
17 | >>> rf.fit(table=H, exclude_attrs=['_id', 'l_id', 'r_id'], target_attr='label') 18 | >>> rf.predict(table=L, exclude_attrs=['id', 'l_id', 'r_id'], append=True, inplace=True, target_attr='rf_predictions') 19 | >>> nb.fit(table=H, exclude_attrs=['_id', 'l_id', 'r_id'], target_attr='label') 20 | >>> nb.predict(table=L, exclude_attrs=['id', 'l_id', 'r_id'], append=True, inplace=True, target_attr='nb_predictions') 21 | >>> mv_combiner = MajorityVote() 22 | >>> L['consol_predictions'] = mv_combiner.combine(L[['dt_predictions', 'rf_predictions', 'nb_predictions']]) 23 | 24 | Conceptually, given a list of predictions (from different matchers), the prediction that 25 | occurs most often is returned as the consolidated prediction. If there is no clear winning 26 | prediction (for example, 0 and 1 occurring an equal number of times), then 0 is returned. 27 | 28 | An example of using weighted voting-based combining is shown below. 29 | 30 | 31 | >>> dt = DTMatcher() 32 | >>> rf = RFMatcher() 33 | >>> nb = NBMatcher() 34 | >>> dt.fit(table=H, exclude_attrs=['_id', 'l_id', 'r_id'], target_attr='label') # H is training set containing feature vectors 35 | >>> dt.predict(table=L, exclude_attrs=['id', 'l_id', 'r_id'], append=True, inplace=True, target_attr='dt_predictions') # L is the test set for which we should get predictions. 36 | >>> rf.fit(table=H, exclude_attrs=['_id', 'l_id', 'r_id'], target_attr='label') 37 | >>> rf.predict(table=L, exclude_attrs=['id', 'l_id', 'r_id'], append=True, inplace=True, target_attr='rf_predictions') 38 | >>> nb.fit(table=H, exclude_attrs=['_id', 'l_id', 'r_id'], target_attr='label') 39 | >>> nb.predict(table=L, exclude_attrs=['id', 'l_id', 'r_id'], append=True, inplace=True, target_attr='nb_predictions') 40 | >>> wv_combiner = WeightedVote(weights=[0.3, 0.2, 0.1], threshold=0.4) 41 | >>> L['consol_predictions'] = wv_combiner.combine(L[['dt_predictions', 42 | 'rf_predictions', 'nb_predictions']]) 43 | 44 | Conceptually, given a list of predictions, each prediction is given a 45 | weight; we compute a weighted sum of these predictions and compare the result to a 46 | threshold. If the result is greater than or equal to the threshold, then the 47 | consolidated prediction is returned as 1 (i.e., a match); otherwise it is returned as 0 (i.e., a non-match). 48 | -------------------------------------------------------------------------------- /docs/user_manual/misc.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | Miscellaneous 3 | ============= 4 | This section covers some miscellaneous things in py_entitymatching. 5 | 6 | .. _label-csv-format: 7 | 8 | CSV Format 9 | ---------- 10 | The CSV format is selected because it’s well known and can be read by numerous external 11 | programs. Further, it can be easily inspected and edited by the users. 12 | You can read more about CSV formats `here `_. 13 | 14 | There are two common CSV formats that are used to store CSV files: one with attribute 15 | names in the first line, and one without. Both these formats are supported by py_entitymatching.
16 | 17 | An example of a CSV file with attribute names is shown below: 18 | :: 19 | 20 | ID, name, birth_year, hourly_wage, zipcode 21 | a1, Kevin Smith, 1989, 30, 94107 22 | a2, Michael Franklin, 1988, 27.5, 94122 23 | a3, William Bridge, 1988, 32, 94321 24 | 25 | An example of a CSV file without attribute names is shown below: 26 | 27 | :: 28 | 29 | a1, Kevin Smith, 1989, 30, 94107 30 | a2, Michael Franklin, 1988, 27.5, 94122 31 | a3, William Bridge, 1988, 32, 94321 32 | 33 | Metadata File Format 34 | -------------------- 35 | The CSV file can be accompanied by a metadata file containing the metadata information 36 | of the table. Typically, it contains information such as key, foreign key, etc. 37 | The metadata file is expected to have the same name as the CSV file but with a `.metadata` 38 | extension. For example, if the CSV file `table_A.csv` contains table A's data, then 39 | `table_A.metadata` will contain table A's metadata. So, the metadata is 40 | associated based on the names of the files. The metadata file contains key-value pairs, 41 | one per line, and each line starts with '#'. 42 | 43 | An example of a metadata file is shown below: 44 | 45 | :: 46 | 47 | #key=ID 48 | 49 | In the above, the pair key=ID states that ID is the key attribute. 50 | 51 | Writing a Dataframe to Disk Along With Its Metadata 52 | --------------------------------------------------- 53 | To write a Dataframe to disk along with its metadata, you can use the `to_csv_metadata` 54 | command in py_entitymatching. An example of using `to_csv_metadata` is shown below: 55 | 56 | >>> em.to_csv_metadata(A, './table_A.csv') 57 | 58 | The above command will first write the Dataframe pointed to by `A` to the `table_A.csv` file on 59 | disk (in CSV format); next it will write the metadata of table A stored in the Catalog 60 | to the `table_A.metadata` file on disk. 61 | 62 | Please refer to the API reference of :py:meth:`~py_entitymatching.to_csv_metadata` for 63 | more details. 64 | 65 | .. note:: Once the Dataframe is written to disk along with metadata, it can be read using the :py:meth:`~py_entitymatching.read_csv_metadata` command. 66 | 67 | 68 | Writing/Reading Other Types of py_entitymatching Objects 69 | ---------------------------------------------------------- 70 | After creating a blocker or feature table, it is desirable to have a 71 | way to persist the objects to disk for future use. py_entitymatching provides 72 | two commands for that purpose: `save_object` and `load_object`. 73 | 74 | An example of using `save_object` is shown below: 75 | 76 | >>> block_f = em.get_features_for_blocking(A, B) 77 | >>> rb = em.RuleBasedBlocker() 78 | >>> rb.add_rule(['name_name_lev(ltuple, rtuple) < 0.4'], block_f) 79 | >>> em.save_object(rb, './rule_based_blocker.pkl') 80 | 81 | `load_object` loads the stored object from disk. An example of using `load_object` is 82 | shown below: 83 | 84 | >>> rb = em.load_object('./rule_based_blocker.pkl') 85 | 86 | Please refer to the API reference of :py:meth:`~py_entitymatching.save_object` and 87 | :py:meth:`~py_entitymatching.load_object` for more details. 88 | -------------------------------------------------------------------------------- /docs/user_manual/profiling.rst: -------------------------------------------------------------------------------- 1 | ============== 2 | Profiling Data 3 | ============== 4 | Profiling data is used to help users get general information about their data.
5 | Before working with the data, it is useful for a user to have a high level 6 | understanding of the data because he or she will be able to take advantage of 7 | the general trends to successfully and efficiently complete the rest of 8 | the workflow. 9 | 10 | Data profiling specifically can show users important statistics such as type, 11 | uniqueness, missing values, quartile statistics, mean, mode, standard deviation, 12 | sum, median absolute deviation, coefficient of variation, kurtosis, and skewness. 13 | It can also display information to the user visually, such as in a histogram. 14 | 15 | We recommend using the Python package pandas-profiling because it is simple 16 | and easy to use. More information about the package can be found on the GitHub 17 | page at https://github.com/JosPolfliet/pandas-profiling 18 | 19 | 20 | Example Usage 21 | ------------- 22 | After reading a CSV file into a Dataframe, pandas-profiling shows the user a 23 | report containing useful profiling information. For example: 24 | 25 | 26 | >>> import pandas_profiling 27 | >>> # Read in csv file 28 | >>> A = em.read_csv_metadata('path_to_csv_dir/table.csv', key='ID') 29 | >>> # Use the profiler 30 | >>> pandas_profiling.ProfileReport(A) 31 | 32 | The user can also check to see if any variables are highly correlated: 33 | 34 | >>> # Read in csv file 35 | >>> import pandas_profiling 36 | >>> A = em.read_csv_metadata('path_to_csv_dir/table.csv', key='ID') 37 | >>> # Use the profiler 38 | >>> profile = pandas_profiling.ProfileReport(A) 39 | >>> # Check for rejected variables 40 | >>> rejected_variables = profile.get_rejected_variables(threshold=0.9) 41 | 42 | The report generated can also be saved into an HTML file: 43 | 44 | 45 | >>> import pandas_profiling 46 | >>> A = em.read_csv_metadata('path_to_csv_dir/table.csv', key='ID') 47 | >>> # Save report to a variable 48 | >>> profile = pandas_profiling.ProfileReport(A) 49 | >>> # Save report to an html file 50 | >>> profile.to_file(outputfile="/tmp/myoutputfile.html") 51 | 52 | For more information about pandas-profiling please go to the GitHub page 53 | at https://github.com/JosPolfliet/pandas-profiling -------------------------------------------------------------------------------- /docs/user_manual/read_csv_files.rst: -------------------------------------------------------------------------------- 1 | =============================== 2 | Reading the CSV Files from Disk 3 | =============================== 4 | Currently, py_entitymatching only supports reading CSV files from disk. 5 | 6 | **The Minimal That You Should Do:** First, you must store the input tables as CSV files 7 | on disk. Please look at section :ref:`label-csv-format` to learn more 8 | about the CSV format. An example of a CSV file will look like this: 9 | 10 | :: 11 | 12 | ID, name, birth_year, hourly_wage, zipcode 13 | a1, Kevin Smith, 1989, 40, 94107 14 | a2, Michael Franklin, 1988, 27.5, 94122 15 | a3, William Bridge, 1988, 32, 94121 16 | 17 | Next, each table in py_entitymatching must have a key column.
If the table already 18 | has a key column, then you can read the CSV file and set the key column like this: 19 | 20 | :: 21 | 22 | # ID is the key column in table.csv 23 | >>> A = em.read_csv_metadata('path_to_csv_dir/table.csv', key='ID') 24 | 25 | If the table does not have a key column, then you can read the CSV file, add a 26 | key column and set the added key column like this: 27 | 28 | :: 29 | 30 | # Read the CSV file 31 | >>> A = em.read_csv_metadata('path_to_csv_dir/table.csv') 32 | # Add a key column with name 'ID' 33 | >>> A['ID'] = range(0, len(A)) 34 | # Set 'ID' as the key column 35 | >>> em.set_key(A, 'ID') 36 | 37 | **If You Want to Read and Play Around More:** In general, the command 38 | :py:meth:`~py_entitymatching.read_csv_metadata` looks for a file (with the same file name 39 | as the `CSV` file) with a `.metadata` extension in the same directory containing the 40 | metadata. If the file containing the metadata information is not present, then 41 | :py:meth:`~py_entitymatching.read_csv_metadata` will proceed by just reading the CSV file 42 | as mentioned in the command. 43 | 44 | To update the metadata for a table using a metadata file, you must first manually create 45 | this file, specify the metadata for the table, and then call 46 | :py:meth:`~py_entitymatching.read_csv_metadata`. The command will automatically read the metadata from the 47 | file and update the Catalog. 48 | 49 | For example, if you read `table.csv` then :py:meth:`~py_entitymatching.read_csv_metadata` 50 | looks for the `table.metadata` file. The contents of `table.metadata` may look like this: 51 | :: 52 | 53 | #key=ID 54 | 55 | Each line in the file starts with `#`. The metadata is written as `key=value` pairs, 56 | one per line. The contents of the above file say that `ID` is the key attribute 57 | (for the table in the file `table.csv`). 58 | 59 | 60 | The table mentioned in the above example, along with the metadata file 61 | stored in the same directory, can be read as follows: 62 | 63 | >>> import py_entitymatching as em 64 | >>> A = em.read_csv_metadata('path_to_csv_dir/table.csv') 65 | 66 | Once the table is read, you can check to see which 67 | attribute of the table is a key using the :py:meth:`~py_entitymatching.get_key` command as 68 | shown below: 69 | 70 | 71 | >>> em.get_key(A) 72 | 'ID' 73 | 74 | As you see, the key for the table is updated correctly as 'ID'. 75 | 76 | See :py:meth:`~py_entitymatching.read_csv_metadata` for more details. 77 | -------------------------------------------------------------------------------- /docs/user_manual/sampling.rst: -------------------------------------------------------------------------------- 1 | .. _label-sampling: 2 | 3 | ======== 4 | Sampling 5 | ======== 6 | If you have to use supervised learning-based matchers or evaluate matchers, you need to 7 | create labeled data. To create labeled data, first you need to take a sample of candidate set 8 | pairs and then label them. 9 | 10 | In *py_entitymatching*, you can use `sample_table` to get a sample. The command does 11 | uniform random sampling without replacement. An example of using `sample_table` is shown 12 | below: 13 | 14 | >>> S = em.sample_table(C, 100) 15 | 16 | The command will first create a copy of the input table, sample the specified number of 17 | tuple pairs from the copy, update the metadata and return the sampled table.
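Because the metadata is carried over, the sampled table can be passed directly to downstream commands such as `label_table`. A minimal sketch of this chaining is shown below (hedged: it assumes `C` is a candidate set produced by a blocker, so its key, foreign keys, and ltable/rtable entries already exist in the Catalog):

>>> S = em.sample_table(C, 100)
>>> em.show_properties(S) # key, fk_ltable, fk_rtable, ltable, rtable are copied over from C
>>> G = em.label_table(S, label_column_name='gold_labels')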
18 | 19 | 20 | For more details, please look at the API reference of :py:meth:`~py_entitymatching.sample_table` -------------------------------------------------------------------------------- /docs/user_manual/select_best_matcher.rst: -------------------------------------------------------------------------------- 1 | ======================= 2 | Selecting an ML-Matcher 3 | ======================= 4 | Once you have created different concrete ML matchers, you have to choose one of 5 | them for matching purposes. There are many different criteria by which one can 6 | decide to choose a matcher, such as `akaike information criterion`, `bayesian information 7 | criterion`, `k-fold cross validation`, etc. Currently py_entitymatching supports 8 | k-fold cross validation; other approaches are left for future work. 9 | 10 | Conceptually, the command to select a matcher would take in the following inputs: 11 | 12 | * List of ML matchers. 13 | * Training data (feature vectors). 14 | * A column of labels that correspond to the feature vectors in the training data. 15 | * Number of folds. 16 | 17 | And it would produce the following output: 18 | 19 | * Selected matcher. 20 | * Statistics such as the mean accuracy of all input matchers. 21 | 22 | In py_entitymatching, the `select_matcher` command addresses the above needs. An 23 | example of using `select_matcher` is shown below: 24 | 25 | >>> dt = em.DTMatcher() 26 | >>> rf = em.RFMatcher() 27 | >>> result = em.select_matcher(matchers=[dt, rf], table=train, exclude_attrs=['_id', 'ltable_id', 'rtable_id'], target_attr='gold_labels', k=5) 28 | 29 | In the above output, `result` is a dictionary containing three keys: (1) selected_matcher, 30 | (2) cv_stats, and (3) drill_down_cv_stats. `selected_matcher` is the selected ML-based matcher, 31 | `cv_stats` is a Dataframe which includes the average cross validation scores for each matcher 32 | and for each metric, and `drill_down_cv_stats` is a dictionary where each key is a metric that 33 | includes the cross validation statistics for each fold. 34 | 35 | Please refer to the API reference of :py:meth:`~py_entitymatching.select_matcher` for 36 | more details. 37 | -------------------------------------------------------------------------------- /docs/user_manual/split_train_test.rst: -------------------------------------------------------------------------------- 1 | ===================================================== 2 | Splitting Labeled Data into Training and Testing Sets 3 | ===================================================== 4 | While doing entity matching you will have to split data for 5 | multiple purposes. Some examples are: 6 | 7 | 1. Split labeled data into development and test sets. The development 8 | set is used to come up with the right features for the learning-based matcher, and 9 | the `test` set is used to evaluate the matcher. 10 | 11 | 2. Split feature vectors into train and test sets. The train 12 | set is used to train the learning-based matcher and the test set is used 13 | for evaluation. 14 | 15 | 16 | py_entitymatching provides the `split_train_test` command for the above need. 17 | An example of using `split_train_test` is shown below: 18 | 19 | >>> train_test = em.split_train_test(G, train_proportion=0.5) 20 | 21 | In the above, `split_train_test` returns a dictionary with two keys: train, and test. 22 | The value for the key `train` is a Dataframe containing tuples 23 | allocated from the input table based on train_proportion.
24 | Similarly, the value for the key `test` is a Dataframe containing 25 | tuples for evaluation. An example of getting the train and test Dataframes from the output 26 | of the `split_train_test` command is shown below: 27 | 28 | 29 | >>> devel_set = train_test['train'] 30 | >>> eval_set = train_test['test'] 31 | 32 | The right value for the train proportion depends on the 33 | context of its use. For instance, if the data is split for machine learning 34 | purposes, then the train proportion is typically larger than the 35 | test proportion. 36 | The most commonly used values of train_proportion are between 37 | 0.5 and 0.8. 38 | 39 | Please refer to the API reference of :py:meth:`~py_entitymatching.split_train_test` for 40 | more details. 41 | 42 | -------------------------------------------------------------------------------- /docs/user_manual/steps_supp_em_workflows.rst: -------------------------------------------------------------------------------- 1 | Steps of Supported EM Workflows 2 | =============================== 3 | .. toctree:: 4 | :maxdepth: 3 5 | 6 | 7 | read_csv_files 8 | down_sampling 9 | profiling 10 | data_exploration 11 | blocking 12 | create_feats_for_blocking 13 | debugging_blocking 14 | sampling 15 | labeling 16 | split_train_test 17 | create_feats_for_matching 18 | extract_feat_vecs 19 | imputing_missing_values 20 | matching 21 | select_best_matcher 22 | debugging_matcher 23 | matchercombiner 24 | triggers 25 | evaluate_matching 26 | -------------------------------------------------------------------------------- /docs/user_manual/whatisnew.rst: -------------------------------------------------------------------------------- 1 | What is New? 2 | ============ 3 | 4 | Compared to Version 0.3.3, the following changes are new: 5 | * Dropped support for Python 2 and Python 3.5. 6 | * To support Python 3.8, updated the function 7 | :code:`py_entitymatching.matcher.matcherutils.impute_table()` to use the current 8 | scikit-learn :code:`SimpleImputer`; see :ref:`Imputing Missing Values` for correct 9 | usage.
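10 | 11 | For reference, a minimal sketch of calling the updated function is shown below. It assumes 12 | `H` is a table of feature vectors whose metadata columns are `_id`, `ltable_id` and 13 | `rtable_id`; see the API reference of :code:`impute_table` for the exact defaults: 14 | 15 | >>> H = em.impute_table(H, exclude_attrs=['_id', 'ltable_id', 'rtable_id'], strategy='mean') 16 |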
10 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/Untitled-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_addfeature_py2-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_attr_equiv_blocker-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_autogenfeature_py3-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_blackboxfunction-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_blocker_combiner-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_catalog-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_combine_ids-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_debug_matcher-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_feature-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_feature_add_features-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | 
"nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_feature_attributeutils-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_feature_extract_featurevecs-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_feature_parse_string-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_io-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_kitchen-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_labeling-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_load_save-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_overlapblocker-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_projection-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_rulebased_blocker-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_sampling-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 
| "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/.ipynb_checkpoints/test_trtst_split-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/guides/.ipynb_checkpoints/Adding Features to Feature Table-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/guides/.ipynb_checkpoints/Combining Multiple Blockers-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/guides/.ipynb_checkpoints/Debugging Blocker Output-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/guides/.ipynb_checkpoints/Evaluating the Selected Matcher-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/guides/.ipynb_checkpoints/Generating Features for Blocking Manually-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/guides/.ipynb_checkpoints/Performing Blocking Using Blackbox Blocker-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/guides/.ipynb_checkpoints/Performing Blocking Using Built-In Blockers (Attr. 
Equivalence Blocker)-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/guides/.ipynb_checkpoints/Performing Blocking Using Built-In Blockers (Overlap Blocker)-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/guides/.ipynb_checkpoints/Performing Blocking Using Rule-Based Blocking-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/guides/.ipynb_checkpoints/Sampling and Labeling-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/guides/.ipynb_checkpoints/Selecting the Best Learning Matcher-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/guides/end_to_end_em_guides/.ipynb_checkpoints/Basic EM Workflow DBLP ACM-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 2 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/guides/end_to_end_em_guides/.ipynb_checkpoints/Basic EM Workflow-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 2 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/guides/end_to_end_em_guides/helper_functions.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import py_entitymatching as em 3 | def get_missing_rows_in_candset(C, L, c_keys, l_keys): 4 | """ 5 | Example usage: 6 | get_missing_rows_in_candset(C, L1, ['ltable_id', 'rtable_id'], ['fodors_id', 'zagats_id']) 7 | L1 is labeled data (with more attrs) 8 | """ 9 | C1 = C[c_keys] 10 | L1 = L[l_keys] 11 | 12 | d = dict() 13 | for t in C1.itertuples(index=False): 14 | d[(t[0], t[1])] = 1 15 | 16 | missing_tuples_in_C = [] 17 | for t in L1.itertuples(index=False): 18 | if (t[0], t[1]) not in d: 19 | missing_tuples_in_C.append(t) 20 | 21 | series_list = [] 22 | 23 | for t in missing_tuples_in_C: 24 | series_list.append((L[(L[l_keys[0]] == t[0]) & (L[l_keys[1]] == t[1])])) 25 | 26 | if len(series_list) == 0: 27 | print('There are no missing tuples') 28 | else: 29 | return pd.concat(series_list) 30 | 31 | def get_sampled_n_labeled_data(C, L, c_keys, l_keys, n, label_col, random_state=0): 32 | """ 33 | Example usage: 34 | get_sampled_n_labeled_data(C, 
L, ['ltable_id', 'rtable_id'], ['fodors_id', 'zagats_id'], 450, 'gold', random_state=0) 35 | L is labeled data (with more attrs) 36 | """ 37 | # C1 = C[c_keys] 38 | # L1 = L[l_keys] 39 | 40 | d = dict() 41 | for t in L[l_keys].itertuples(index=False): 42 | d[(t[0], t[1])] = 1 43 | 44 | diff_tuples_in_C = [] 45 | for t in C[c_keys].itertuples(index=False): 46 | if (t[0], t[1]) not in d: 47 | diff_tuples_in_C.append(t) 48 | 49 | series_list = [] 50 | 51 | for t in diff_tuples_in_C: 52 | series_list.append((C[(C[c_keys[0]] == t[0]) & (C[c_keys[1]] == t[1])])) 53 | 54 | if len(series_list) == 0: 55 | print('There are no diff tuples in C'); return None  # nothing to sample negatives from 56 | else: 57 | neg_tuples = pd.concat(series_list) 58 | 59 | # pos_tuples 60 | pos_tuples_in_C = [] 61 | for t in C[c_keys].itertuples(index=False): 62 | if (t[0], t[1]) in d: 63 | pos_tuples_in_C.append(t) 64 | 65 | 66 | series_list = [] 67 | 68 | for t in pos_tuples_in_C: 69 | series_list.append((C[(C[c_keys[0]] == t[0]) & (C[c_keys[1]] == t[1])])) 70 | 71 | if len(series_list) == 0: 72 | print('There are no pos tuples in C'); return None  # nothing to label as positive 73 | else: 74 | pos_tuples = pd.concat(series_list) 75 | 76 | neg_tuples = neg_tuples.sample(n-len(pos_tuples), random_state=random_state) 77 | 78 | pos_tuples[label_col] = 1 79 | neg_tuples[label_col] = 0 80 | concat_df = pd.concat([pos_tuples, neg_tuples], ignore_index=True) 81 | concat_df = concat_df.sample(frac=1).reset_index(drop=True) 82 | em.copy_properties(C, concat_df) 83 | return concat_df 84 | -------------------------------------------------------------------------------- /notebooks/guides/step_wise_em_guides/.ipynb_checkpoints/Performing Matching with a Rule-Based Matcher-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 2 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/guides/step_wise_em_guides/.ipynb_checkpoints/Using Match Triggers to Improve Results-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 2 6 | } 7 | -------------------------------------------------------------------------------- /notebooks/vldb_demo/README: -------------------------------------------------------------------------------- 1 | 1. Prerequisites: Python 2.7 or Python 3.4+ 2 | 3 | 2. First, install the following packages: 4 | - py_entitymatching 5 | - seaborn 6 | - Jupyter notebook 7 | You can install the packages using conda like this: 8 | $ conda install py_entitymatching -c uwmagellan 9 | $ conda install seaborn jupyter 10 | 11 | 3. Next, launch the Jupyter notebook from the current directory and open demo.ipynb 12 | NOTE: demo.ipynb assumes that the data files (*.csv) and helper scripts (profiler.py) are present in the same directory (as demo.ipynb), 13 | so do not remove those files from the directory. 14 | 15 | 4. Now you can run the cells in demo.ipynb to recreate the demo scenario.
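16 | 17 | NOTE: For reference, a first cell of demo.ipynb might look like the following sketch; the 18 | CSV file name ('table_A.csv'), key column ('ID') and attribute ('name') are placeholders 19 | for the data files shipped in this directory: 20 | 21 | import py_entitymatching as em 22 | from profiler import profile_table 23 | A = em.read_csv_metadata('table_A.csv', key='ID')  # placeholder file name and key 24 | profile_table(A, 'name', plot=False)               # profile one attribute without plotting 25 |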
16 | -------------------------------------------------------------------------------- /notebooks/vldb_demo/profiler.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import seaborn as sns 3 | import matplotlib.pyplot as plt 4 | 5 | # def profile_table(df, attribute, plot=True): 6 | # out_df = pd.DataFrame(columns=['Property', 'Value']) 7 | # unique_values = pd.unique(df[attribute]) 8 | # num_missing = sum(pd.isnull(df[attribute])) 9 | # 10 | # if not plot: 11 | # out_df.set_value(0, 'Property', 'Num. Missing Values') 12 | # out_df.set_value(0, 'Value', num_missing) 13 | # out_df.set_value(1, 'Property', 'Num. Unique Values') 14 | # out_df.set_value(1, 'Value', len(unique_values)) 15 | # out_df.set_value(2, 'Property', 'List of Unique Values') 16 | # out_df.set_value(2, 'Value', sorted(list(unique_values))) 17 | # return out_df 18 | # else: 19 | # print('Number of unique values: %d' % len(unique_values)) 20 | # print('Number of missing values: %d' % num_missing) 21 | # print('\nUnique values: ') 22 | # print(sorted(list(unique_values))) 23 | # print('\nFrequency plot:\n') 24 | # 25 | # d = (pd.DataFrame(df[attribute].value_counts())) 26 | # d.sort_index(inplace=True) 27 | # ax = sns.barplot(x="index", y=attribute, data=( 28 | # pd.DataFrame(df[attribute].value_counts())).reset_index()) 29 | # ax.set(xlabel=attribute, ylabel='count') 30 | # ax.grid(b=True, which='major', color='w', linewidth=1.0) 31 | # ax.set_xticklabels(labels=d.index.values, rotation=90) 32 | # plt.show() 33 | 34 | 35 | def profile_table(df, attribute, plot=True): 36 | 37 | unique_values = pd.unique(df[attribute]) 38 | num_missing = sum(pd.isnull(df[attribute])) 39 | 40 | if not plot: 41 | return pd.DataFrame({'Property':['Num. Missing Values', 'Num. 
Unique Values', 'List of Unique Values'], 42 | 'Value':[num_missing, len(unique_values), sorted(list(unique_values))]}) 43 | else: 44 | print('Number of unique values: %d\nNumber of missing values: ' 45 | '%d\n\nUnique values:' % (len(unique_values), num_missing)) 46 | print(sorted(list(unique_values))) 47 | print('\nFrequency plot:\n') 48 | d = (pd.DataFrame(df[attribute].value_counts())) 49 | ax = sns.barplot(x="index", y=attribute, data=(d).reset_index()) 50 | ax.set(xlabel=attribute, ylabel='count') 51 | ax.grid(b=True, which='major', color='w', linewidth=1.0) 52 | ax.set_xticklabels(labels=d.sort_index().index.values, rotation=90) 53 | plt.show() 54 | -------------------------------------------------------------------------------- /py_entitymatching/blocker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/blocker/__init__.py -------------------------------------------------------------------------------- /py_entitymatching/blockercombiner/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/blockercombiner/__init__.py -------------------------------------------------------------------------------- /py_entitymatching/catalog/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | -------------------------------------------------------------------------------- /py_entitymatching/dask/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/dask/__init__.py -------------------------------------------------------------------------------- /py_entitymatching/dask/dask_dtmatcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains the functions for Decision Tree learning-based matcher. 3 | """ 4 | # from py_entitymatching.matcher.mlmatcher import MLMatcher 5 | import logging 6 | 7 | from py_entitymatching.dask.daskmlmatcher import DaskMLMatcher 8 | from py_entitymatching.matcher.matcherutils import get_ts 9 | 10 | logger = logging.getLogger(__name__) 11 | from sklearn.tree import DecisionTreeClassifier 12 | 13 | 14 | class DaskDTMatcher(DaskMLMatcher): 15 | """ 16 | WARNING THIS MATCHER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK. 17 | 18 | Decision Tree matcher. 19 | 20 | Args: 21 | *args,**kwargs: The arguments to scikit-learn's Decision Tree 22 | classifier. 23 | name (string): The name of this matcher (defaults to None). If the 24 | matcher name is None, the class automatically generates a string 25 | and assigns it as the name. 26 | 27 | 28 | """ 29 | 30 | def __init__(self, *args, **kwargs): 31 | logger.warning( 32 | "WARNING THIS MATCHER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.") 33 | 34 | super(DaskDTMatcher, self).__init__() 35 | # If the name is given, then pop it 36 | name = kwargs.pop('name', None) 37 | if name is None: 38 | # If the name of the matcher is not given, then create one. 39 | # Currently, we use a constant string + a random number.
40 | self.name = 'DecisionTree' + '_' + get_ts() 41 | else: 42 | # Set the name of the matcher, with the given name. 43 | self.name = name 44 | # Set the classifier to the scikit-learn classifier. 45 | self.clf = DecisionTreeClassifier(*args, **kwargs) -------------------------------------------------------------------------------- /py_entitymatching/dask/dask_logregmatcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains the functions for Logistic Regression classifier. 3 | """ 4 | import logging 5 | from py_entitymatching.dask.daskmlmatcher import DaskMLMatcher 6 | from sklearn.linear_model import LogisticRegression 7 | from py_entitymatching.matcher.matcherutils import get_ts 8 | logger = logging.getLogger(__name__) 9 | 10 | class DaskLogRegMatcher(DaskMLMatcher): 11 | """ 12 | WARNING THIS MATCHER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK. 13 | 14 | Logistic Regression matcher. 15 | 16 | Args: 17 | *args,**kwargs: The arguments to scikit-learn's Logistic Regression 18 | classifier. 19 | name (string): The name of this matcher (defaults to None). If the 20 | matcher name is None, the class automatically generates a string 21 | and assigns it as the name. 22 | 23 | 24 | """ 25 | 26 | def __init__(self, *args, **kwargs): 27 | logger.warning( 28 | "WARNING THIS MATCHER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.") 29 | 30 | # If the name is given, then pop it 31 | name = kwargs.pop('name', None) 32 | if name is None: 33 | # If the name of the matcher is not given, then create one. 34 | # Currently, we use a constant string + a random number. 35 | self.name = 'LogisticRegression' + '_' + get_ts() 36 | else: 37 | # Set the name of the matcher, with the given name. 38 | self.name = name 39 | super(DaskLogRegMatcher, self).__init__() 40 | # Set the classifier to the scikit-learn classifier. 41 | self.clf = LogisticRegression(*args, **kwargs) 42 | self.clf.classes_ = [0, 1] -------------------------------------------------------------------------------- /py_entitymatching/dask/dask_nbmatcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains the functions for Naive Bayes classifier. 3 | """ 4 | 5 | import logging 6 | from py_entitymatching.dask.daskmlmatcher import DaskMLMatcher 7 | from py_entitymatching.matcher.matcherutils import get_ts 8 | 9 | from sklearn.naive_bayes import GaussianNB 10 | logger = logging.getLogger(__name__) 11 | class DaskNBMatcher(DaskMLMatcher): 12 | """ 13 | WARNING THIS MATCHER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK. 14 | 15 | Naive Bayes matcher. 16 | 17 | Args: 18 | *args,**kwargs: The arguments to scikit-learn's Naive Bayes 19 | classifier. 20 | 21 | name (string): The name of this matcher (defaults to None). If the 22 | matcher name is None, the class automatically generates a string 23 | and assigns it as the name. 24 | 25 | 26 | """ 27 | def __init__(self, *args, **kwargs): 28 | logger.warning( 29 | "WARNING THIS MATCHER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.") 30 | # If the name is given, then pop it 31 | name = kwargs.pop('name', None) 32 | if name is None: 33 | # If the name of the matcher is not given, then create one. 34 | # Currently, we use a constant string + a random number. 35 | self.name = 'NaiveBayes'+ '_' + get_ts() 36 | else: 37 | # Set the name of the matcher, with the given name.
38 | self.name = name 39 | super(DaskNBMatcher, self).__init__() 40 | # Set the classifier to the scikit-learn classifier. 41 | self.clf = GaussianNB(*args, **kwargs) -------------------------------------------------------------------------------- /py_entitymatching/dask/dask_rfmatcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains the functions for Random Forest classifier. 3 | """ 4 | 5 | import logging 6 | from py_entitymatching.dask.daskmlmatcher import DaskMLMatcher 7 | from py_entitymatching.matcher.matcherutils import get_ts 8 | 9 | from sklearn.ensemble import RandomForestClassifier 10 | logger = logging.getLogger(__name__) 11 | 12 | class DaskRFMatcher(DaskMLMatcher): 13 | """ 14 | WARNING THIS MATCHER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK. 15 | 16 | Random Forest matcher. 17 | 18 | Args: 19 | *args,**kwargs: The arguments to scikit-learn's Random Forest 20 | classifier. 21 | 22 | name (string): The name of this matcher (defaults to None). If the 23 | matcher name is None, the class automatically generates a string 24 | and assigns it as the name. 25 | 26 | 27 | """ 28 | 29 | def __init__(self, *args, **kwargs): 30 | logger.warning( 31 | "WARNING THIS MATCHER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.") 32 | 33 | super(DaskRFMatcher, self).__init__() 34 | # If the name is given, then pop it 35 | name = kwargs.pop('name', None) 36 | if name is None: 37 | # If the name of the matcher is not given, then create one. 38 | # Currently, we use a constant string + a random number. 39 | self.name = 'RandomForest' + '_' + get_ts() 40 | else: 41 | # Set the name of the matcher, with the given name. 42 | self.name = name 43 | # Set the classifier to the scikit-learn classifier. 44 | self.clf = RandomForestClassifier(*args, **kwargs) -------------------------------------------------------------------------------- /py_entitymatching/dask/dask_svm_matcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains the functions for SVM classifier. 3 | 4 | """ 5 | import logging 6 | from py_entitymatching.dask.daskmlmatcher import DaskMLMatcher 7 | from py_entitymatching.matcher.matcherutils import get_ts 8 | 9 | from sklearn.svm import SVC 10 | logger = logging.getLogger(__name__) 11 | 12 | class DaskSVMMatcher(DaskMLMatcher): 13 | """ 14 | WARNING THIS MATCHER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK. 15 | 16 | SVM matcher. 17 | 18 | Args: 19 | *args,**kwargs: The arguments to scikit-learn's SVM 20 | classifier. 21 | name (string): The name of this matcher (defaults to None). If the 22 | matcher name is None, the class automatically generates a string 23 | and assigns it as the name. 24 | 25 | 26 | """ 27 | 28 | def __init__(self, *args, **kwargs): 29 | logger.warning( 30 | "WARNING THIS MATCHER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.") 31 | 32 | super(DaskSVMMatcher, self).__init__() 33 | # If the name is given, then pop it 34 | name = kwargs.pop('name', None) 35 | if name is None: 36 | # If the name of the matcher is not given, then create one. 37 | # Currently, we use a constant string + a random number. 38 | self.name = 'SVM' + '_' + get_ts() 39 | else: 40 | # Set the name of the matcher, with the given name. 41 | self.name = name 42 | # Set the classifier to the scikit-learn classifier.
43 | self.clf = SVC(*args, **kwargs) -------------------------------------------------------------------------------- /py_entitymatching/dask/dask_xgboost_matcher.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from py_entitymatching.dask.daskmlmatcher import DaskMLMatcher 3 | from py_entitymatching.matcher.matcherutils import get_ts 4 | 5 | logger = logging.getLogger(__name__) 6 | try: 7 | from xgboost.sklearn import XGBClassifier 8 | except ImportError: 9 | raise ImportError('Check if xgboost library is installed. You can install xgboost ' 10 | 'by following the instructions at http://xgboost.readthedocs.io/en/latest/build.html') 11 | 12 | 13 | class DaskXGBoostMatcher(DaskMLMatcher): 14 | """ 15 | WARNING THIS MATCHER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK. 16 | 17 | XGBoost matcher. 18 | 19 | Args: 20 | *args,**kwargs: The arguments to XGBoost 21 | classifier. 22 | name (string): The name of this matcher (defaults to None). If the 23 | matcher name is None, the class automatically generates a string 24 | and assigns it as the name. 25 | 26 | 27 | """ 28 | 29 | def __init__(self, *args, **kwargs): 30 | logger.warning( 31 | "WARNING THIS MATCHER IS EXPERIMENTAL AND NOT TESTED. USE AT YOUR OWN RISK.") 32 | 33 | super(DaskXGBoostMatcher, self).__init__() 34 | # If the name is given, then pop it 35 | name = kwargs.pop('name', None) 36 | if name is None: 37 | # If the name of the matcher is not given, then create one. 38 | # Currently, we use a constant string + a random number. 39 | self.name = 'xgboost' + '_' + get_ts() 40 | else: 41 | # Set the name of the matcher, with the given name. 42 | self.name = name 43 | # Set the classifier to the scikit-learn classifier. 44 | try: 45 | from xgboost.sklearn import XGBClassifier 46 | except ImportError: 47 | raise ImportError( 48 | 'Check if xgboost library is installed.
You can install xgboost ' 49 | 'by following the instructions at http://xgboost.readthedocs.io/en/latest/build.html') 50 | self.clf = XGBClassifier(*args, **kwargs) 51 | -------------------------------------------------------------------------------- /py_entitymatching/dask/utils.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | 3 | def validate_chunks(n): 4 | if n == 0: 5 | raise AssertionError('The number of chunks cannot be 0 ') 6 | elif n <= -2: 7 | raise AssertionError('The number of chunks should be -1 or > 0') 8 | 9 | def get_num_partitions(given_partitions, n): 10 | if given_partitions == -1: 11 | return multiprocessing.cpu_count() 12 | elif given_partitions > n: 13 | return n 14 | else: 15 | return given_partitions 16 | 17 | def get_num_cores(): 18 | return multiprocessing.cpu_count() 19 | 20 | def wrap(object): 21 | return object -------------------------------------------------------------------------------- /py_entitymatching/datasets/acm_demo.metadata: -------------------------------------------------------------------------------- 1 | #key=id 2 | -------------------------------------------------------------------------------- /py_entitymatching/datasets/end-to-end/acm_demo.metadata: -------------------------------------------------------------------------------- 1 | #key=id 2 | -------------------------------------------------------------------------------- /py_entitymatching/datasets/end-to-end/dblp_demo.metadata: -------------------------------------------------------------------------------- 1 | #key=id 2 | -------------------------------------------------------------------------------- /py_entitymatching/datasets/end-to-end/profiler.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import seaborn as sns 3 | import matplotlib.pyplot as plt 4 | 5 | # def profile_table(df, attribute, plot=True): 6 | # out_df = pd.DataFrame(columns=['Property', 'Value']) 7 | # unique_values = pd.unique(df[attribute]) 8 | # num_missing = sum(pd.isnull(df[attribute])) 9 | # 10 | # if not plot: 11 | # out_df.set_value(0, 'Property', 'Num. Missing Values') 12 | # out_df.set_value(0, 'Value', num_missing) 13 | # out_df.set_value(1, 'Property', 'Num. Unique Values') 14 | # out_df.set_value(1, 'Value', len(unique_values)) 15 | # out_df.set_value(2, 'Property', 'List of Unique Values') 16 | # out_df.set_value(2, 'Value', sorted(list(unique_values))) 17 | # return out_df 18 | # else: 19 | # print('Number of unique values: %d' % len(unique_values)) 20 | # print('Number of missing values: %d' % num_missing) 21 | # print('\nUnique values: ') 22 | # print(sorted(list(unique_values))) 23 | # print('\nFrequency plot:\n') 24 | # 25 | # d = (pd.DataFrame(df[attribute].value_counts())) 26 | # d.sort_index(inplace=True) 27 | # ax = sns.barplot(x="index", y=attribute, data=( 28 | # pd.DataFrame(df[attribute].value_counts())).reset_index()) 29 | # ax.set(xlabel=attribute, ylabel='count') 30 | # ax.grid(b=True, which='major', color='w', linewidth=1.0) 31 | # ax.set_xticklabels(labels=d.index.values, rotation=90) 32 | # plt.show() 33 | 34 | 35 | def profile_table(df, attribute, plot=True): 36 | 37 | unique_values = pd.unique(df[attribute]) 38 | num_missing = sum(pd.isnull(df[attribute])) 39 | 40 | if not plot: 41 | return pd.DataFrame({'Property':['Num. Missing Values', 'Num. 
Unique Values', 'List of Unique Values'], 42 | 'Value':[num_missing, len(unique_values), sorted(list(unique_values))]}) 43 | else: 44 | print('Number of unique values: %d\nNumber of missing values: ' 45 | '%d\n\nUnique values:' % (len(unique_values), num_missing)) 46 | print(sorted(list(unique_values))) 47 | print('\nFrequency plot:\n') 48 | d = (pd.DataFrame(df[attribute].value_counts())) 49 | ax = sns.barplot(x="index", y=attribute, data=(d).reset_index()) 50 | ax.set(xlabel=attribute, ylabel='count') 51 | ax.grid(b=True, which='major', color='w', linewidth=1.0) 52 | ax.set_xticklabels(labels=d.sort_index().index.values, rotation=90) 53 | plt.show() 54 | -------------------------------------------------------------------------------- /py_entitymatching/datasets/end-to-end/restaurants/lbl_restnt_wf1.metadata: -------------------------------------------------------------------------------- 1 | #rtable=POINTER 2 | #fk_ltable=ltable_id 3 | #ltable=POINTER 4 | #key=_id 5 | #fk_rtable=rtable_id 6 | -------------------------------------------------------------------------------- /py_entitymatching/datasets/end-to-end/restaurants/match_fodors_zagats_more_attrs.metadata: -------------------------------------------------------------------------------- 1 | #key=_id 2 | #ltable=POINTER 3 | #rtable=POINTER 4 | #fk_rtable=zagats_id 5 | #fk_ltable=fodors_id 6 | -------------------------------------------------------------------------------- /py_entitymatching/datasets/end-to-end/restaurants/matches_fodors_zagats.csv: -------------------------------------------------------------------------------- 1 | fodors_id,zagats_id 2 | 534,219 3 | 535,220 4 | 536,221 5 | 537,222 6 | 538,223 7 | 539,224 8 | 540,225 9 | 541,226 10 | 542,227 11 | 543,228 12 | 544,229 13 | 545,230 14 | 546,231 15 | 547,232 16 | 548,233 17 | 549,234 18 | 550,235 19 | 551,236 20 | 552,237 21 | 553,238 22 | 554,239 23 | 555,240 24 | 556,241 25 | 557,242 26 | 558,243 27 | 559,244 28 | 560,245 29 | 561,246 30 | 562,247 31 | 563,248 32 | 564,249 33 | 565,250 34 | 566,251 35 | 567,252 36 | 568,253 37 | 569,254 38 | 570,255 39 | 571,256 40 | 572,257 41 | 573,258 42 | 574,259 43 | 575,260 44 | 576,261 45 | 577,262 46 | 578,263 47 | 579,264 48 | 580,265 49 | 581,266 50 | 582,267 51 | 583,268 52 | 584,269 53 | 585,270 54 | 586,271 55 | 587,272 56 | 588,273 57 | 589,274 58 | 590,275 59 | 591,276 60 | 592,277 61 | 593,278 62 | 594,279 63 | 595,280 64 | 596,281 65 | 597,282 66 | 598,283 67 | 599,284 68 | 600,285 69 | 601,286 70 | 602,287 71 | 603,288 72 | 604,289 73 | 605,290 74 | 606,291 75 | 607,292 76 | 608,293 77 | 609,294 78 | 610,295 79 | 611,296 80 | 612,297 81 | 613,298 82 | 614,299 83 | 615,300 84 | 616,301 85 | 617,302 86 | 618,303 87 | 619,304 88 | 620,305 89 | 621,306 90 | 622,307 91 | 623,308 92 | 624,309 93 | 625,310 94 | 626,311 95 | 627,312 96 | 628,313 97 | 629,314 98 | 630,315 99 | 631,316 100 | 632,317 101 | 633,318 102 | 634,319 103 | 635,320 104 | 636,321 105 | 637,322 106 | 638,323 107 | 639,324 108 | 640,325 109 | 641,326 110 | 642,327 111 | 643,328 112 | 644,329 113 | 645,330 114 | -------------------------------------------------------------------------------- /py_entitymatching/datasets/person_table_A.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | a1,Kevin Smith,1989,30,"607 From St, San Francisco",94107 3 | a2,Michael Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122 4 | a3,William Bridge,1986,32,"3131 Webster St, San 
Francisco",94107 5 | a4,Binto George,1987,32.5,"423 Powell St, San Francisco",94122 6 | a5,Alphonse Kemper,1984,35,"1702 Post Street, San Francisco",94122 -------------------------------------------------------------------------------- /py_entitymatching/datasets/person_table_A.metadata: -------------------------------------------------------------------------------- 1 | #key=ID 2 | -------------------------------------------------------------------------------- /py_entitymatching/datasets/person_table_B.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | b1,Mark Levene,1987,29.5,"108 Clement St, San Francisco",94107 3 | b2,Bill Bridge,1986,32,"3131 Webster St, San Francisco",94107 4 | b3,Mike Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122 5 | b4,Joseph Kuan,1982,26,"108 South Park, San Francisco",94122 6 | b5,Alfons Kemper,1984,35,"170 Post St, Apt 4, San Francisco",94122 7 | b6,Michael Brodie,1987,32.5,"133 Clement Street, San Francisco",94107 -------------------------------------------------------------------------------- /py_entitymatching/datasets/person_table_B.metadata: -------------------------------------------------------------------------------- 1 | #key=ID 2 | -------------------------------------------------------------------------------- /py_entitymatching/debugblocker/GenerateRecomLists.h: -------------------------------------------------------------------------------- 1 | #ifndef TEST_GENERATERECOMLISTS_H 2 | #define TEST_GENERATERECOMLISTS_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "TopkHeader.h" 12 | using namespace std; 13 | 14 | typedef map > CandSet; 15 | typedef vector > Table; 16 | typedef map, int> TopkRankList; 17 | 18 | double double_max(const double a, double b); 19 | 20 | class RecPair { 21 | public: 22 | int l_rec, r_rec, rank; 23 | RecPair(int l_rec, int r_rec, int rank) : l_rec(l_rec), r_rec(r_rec), rank(rank){ 24 | 25 | } 26 | }; 27 | class GenerateRecomLists { 28 | public: 29 | 30 | Table generate_config(const vector& field_list, const vector& ltoken_sum_vector, 31 | const vector& rtoken_sum_vector, const double field_remove_ratio, 32 | const unsigned int ltable_size, const unsigned int rtable_size); 33 | Table sort_config(Table& config_lists); 34 | 35 | TopkRankList generate_topk_with_config(vector& config, Table& ltoken_vector, Table& rtoken_vector, 36 | Table& lindex_vector, Table& rindex_vector, 37 | CandSet& cand_set, unsigned int output_size); 38 | 39 | vector generate_recom_lists(Table& ltoken_vector, Table& rtoken_vector, 40 | Table& lindex_vector, Table& rindex_vector, 41 | vector& ltoken_sum_vector, vector& rtoken_sum_vector, vector& field_list, 42 | CandSet& cand_set, double field_remove_ratio, 43 | unsigned int output_size); 44 | 45 | vector merge_topk_lists(vector& rec_lists); 46 | GenerateRecomLists(); 47 | ~GenerateRecomLists(); 48 | }; 49 | 50 | 51 | #endif //TEST_GENERATERECOMLISTS_H 52 | -------------------------------------------------------------------------------- /py_entitymatching/debugblocker/PrefixEvent.cpp: -------------------------------------------------------------------------------- 1 | #include "PrefixEvent.h" 2 | 3 | PrefixEvent::PrefixEvent() { } 4 | 5 | PrefixEvent::PrefixEvent(double thres, int indicator, int rec, int tok) { 6 | threshold = thres; 7 | table_indicator = indicator; 8 | rec_idx = rec; 9 | tok_idx = tok; 10 | } 11 | 12 | 
PrefixEvent::~PrefixEvent() { } 13 | 14 | bool PrefixEvent::operator<(const PrefixEvent &other) const 15 | { 16 | return threshold < other.threshold; 17 | } 18 | 19 | bool PrefixEvent::operator>(const PrefixEvent &other) const 20 | { 21 | return threshold > other.threshold; 22 | } 23 | -------------------------------------------------------------------------------- /py_entitymatching/debugblocker/PrefixEvent.h: -------------------------------------------------------------------------------- 1 | class PrefixEvent { 2 | public: 3 | double threshold; 4 | int table_indicator; 5 | int rec_idx; 6 | int tok_idx; 7 | 8 | PrefixEvent(); 9 | PrefixEvent(double threshold, int table_indicator, int rec_idx, int tok_idx); 10 | ~PrefixEvent(); 11 | 12 | bool operator<(const PrefixEvent &other) const; 13 | bool operator>(const PrefixEvent &other) const; 14 | }; 15 | -------------------------------------------------------------------------------- /py_entitymatching/debugblocker/TopPair.cpp: -------------------------------------------------------------------------------- 1 | #include "TopPair.h" 2 | 3 | TopPair::TopPair() { } 4 | 5 | TopPair::TopPair(double similarity, int l_rec_idx, int r_rec_idx) { 6 | sim = similarity; 7 | l_rec = l_rec_idx; 8 | r_rec = r_rec_idx; 9 | } 10 | 11 | TopPair::~TopPair() { } 12 | 13 | bool TopPair::operator<(const TopPair &other) const 14 | { 15 | return sim > other.sim; 16 | } 17 | 18 | bool TopPair::operator>(const TopPair &other) const 19 | { 20 | return sim < other.sim; 21 | } 22 | -------------------------------------------------------------------------------- /py_entitymatching/debugblocker/TopPair.h: -------------------------------------------------------------------------------- 1 | #ifndef TEST_TOPPAIR_H 2 | #define TEST_TOPPAIR_H 3 | 4 | class TopPair { 5 | public: 6 | double sim; 7 | int l_rec; 8 | int r_rec; 9 | 10 | TopPair(); 11 | TopPair(double similarity, int l_rec_idx, int r_rec_idx); 12 | ~TopPair(); 13 | 14 | bool operator<(const TopPair &other) const; 15 | bool operator>(const TopPair &other) const; 16 | }; 17 | 18 | 19 | #endif //TEST_TOPPAIR_H 20 | -------------------------------------------------------------------------------- /py_entitymatching/debugblocker/TopkHeader.cpp: -------------------------------------------------------------------------------- 1 | #include "TopkHeader.h" 2 | 3 | 4 | void original_generate_prefix_events_impl(const Table& table, const int table_indicator, 5 | PrefixHeap& prefix_events) { 6 | for (unsigned int i = 0; i < table.size(); ++i) { 7 | unsigned long int length = table[i].size(); 8 | if (length > 0) { 9 | for (unsigned int j = 0; j < length; ++j) { 10 | prefix_events.push(PrefixEvent(1.0 - j * 1.0 / length, table_indicator, i, j)); 11 | } 12 | } 13 | } 14 | } 15 | 16 | void original_generate_prefix_events(const Table& ltable, const Table& rtable, 17 | PrefixHeap& prefix_events) { 18 | original_generate_prefix_events_impl(ltable, 0, prefix_events); 19 | original_generate_prefix_events_impl(rtable, 1, prefix_events); 20 | } 21 | 22 | 23 | int original_plain_get_overlap(const vector<int>& ltoken_list, const vector<int>& rtoken_list) { 24 | int overlap = 0; 25 | set<int> rset; 26 | 27 | for (unsigned int i = 0; i < rtoken_list.size(); ++i) { 28 | rset.insert(rtoken_list[i]); 29 | } 30 | 31 | for (unsigned int i = 0; i < ltoken_list.size(); ++i) { 32 | if (rset.count(ltoken_list[i])) { 33 | ++overlap; 34 | } 35 | } 36 | 37 | return overlap; 38 | } 39 | --------------------------------------------------------------------------------
/py_entitymatching/debugblocker/TopkHeader.h: -------------------------------------------------------------------------------- 1 | #ifndef __TOPKHEADER_H__ 2 | #define __TOPKHEADER_H__ 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include "TopPair.h" 12 | #include "PrefixEvent.h" 13 | 14 | using namespace std; 15 | 16 | typedef priority_queue<TopPair> Heap; 17 | typedef map<int, set<int> > CandSet; 18 | typedef map<int, map<int, vector<int> > > InvertedIndex; 19 | typedef vector<vector<int> > Table; 20 | typedef priority_queue<PrefixEvent> PrefixHeap; 21 | 22 | 23 | Heap original_topk_sim_join_plain(const Table& ltoken_vector, const Table& rtoken_vector, 24 | CandSet& cand_set, const unsigned int output_size); 25 | 26 | 27 | int original_plain_get_overlap(const vector<int>& ltoken_list, const vector<int>& rtoken_list); 28 | 29 | 30 | void original_generate_prefix_events_impl(const Table& table, const int table_indicator, 31 | PrefixHeap& prefix_events); 32 | 33 | void original_generate_prefix_events(const Table& ltable, const Table& rtable, 34 | PrefixHeap& prefix_events); 35 | 36 | #endif //__TOPKHEADER_H__ 37 | -------------------------------------------------------------------------------- /py_entitymatching/debugblocker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/debugblocker/__init__.py -------------------------------------------------------------------------------- /py_entitymatching/debugmatcher/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /py_entitymatching/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/evaluation/__init__.py -------------------------------------------------------------------------------- /py_entitymatching/experimental/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/experimental/__init__.py -------------------------------------------------------------------------------- /py_entitymatching/explorer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/explorer/__init__.py -------------------------------------------------------------------------------- /py_entitymatching/explorer/openrefine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/explorer/openrefine/__init__.py -------------------------------------------------------------------------------- /py_entitymatching/explorer/pandastable/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/explorer/pandastable/__init__.py --------------------------------------------------------------------------------
/py_entitymatching/explorer/pandastable/pandastable_wrapper.py: -------------------------------------------------------------------------------- 1 | try: 2 | from tkinter import * 3 | except ImportError as e: 4 | from Tkinter import * 5 | 6 | from py_entitymatching.utils.validation_helper import validate_object_type 7 | import pandas as pd 8 | 9 | 10 | def data_explore_pandastable(df): 11 | """ 12 | Wrapper function for pandastable. Gives user a GUI to examine and edit 13 | the dataframe passed in using pandastable. 14 | 15 | Args: 16 | df (Dataframe): The pandas dataframe to be explored with pandastable. 17 | 18 | Raises: 19 | AssertionError: If `df` is not of type pandas DataFrame. 20 | 21 | Examples: 22 | >>> import py_entitymatching as em 23 | >>> A = em.read_csv_metadata('path_to_csv_dir/table.csv', key='ID') 24 | >>> em.data_explore_pandastable(A) 25 | 26 | """ 27 | 28 | # Validate input parameters 29 | # # We expect the df to be of type pandas DataFrame 30 | validate_object_type(df, pd.DataFrame, 'Input df') 31 | DataExplorePandastable(df) 32 | 33 | 34 | class DataExplorePandastable(Frame): 35 | """ 36 | A wrapper for pandastable. 37 | """ 38 | 39 | def __init__(self, df): 40 | # Import 41 | try: 42 | from pandastable import Table, TableModel 43 | except ImportError: 44 | raise ImportError('Pandastable is not installed. Please install pandastable to use ' 45 | 'pandastable data exploration functions.') 46 | 47 | self.parent = None 48 | Frame.__init__(self) 49 | self.main = self.master 50 | self.main.geometry('600x400+200+100') 51 | self.main.title('Explore Data') 52 | f = Frame(self.main) 53 | f.pack(fill=BOTH, expand=1) 54 | # set the table in the GUI 55 | self.table = pt = Table(f, dataframe=df, 56 | showtoolbar=True, showstatusbar=True) 57 | pt.show() 58 | self.mainloop() 59 | 60 | -------------------------------------------------------------------------------- /py_entitymatching/feature/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/feature/__init__.py -------------------------------------------------------------------------------- /py_entitymatching/gui/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | -------------------------------------------------------------------------------- /py_entitymatching/io/__init__.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import logging 3 | import os 4 | 5 | 6 | -------------------------------------------------------------------------------- /py_entitymatching/labeler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/labeler/__init__.py -------------------------------------------------------------------------------- /py_entitymatching/matcher/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | -------------------------------------------------------------------------------- /py_entitymatching/matcher/dtmatcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains the functions for Decision Tree learning-based matcher. 
3 | """ 4 | from py_entitymatching.matcher.mlmatcher import MLMatcher 5 | from py_entitymatching.matcher.matcherutils import get_ts 6 | 7 | from sklearn.tree import DecisionTreeClassifier 8 | 9 | class DTMatcher(MLMatcher): 10 | """ 11 | Decision Tree matcher. 12 | 13 | Args: 14 | *args,**kwargs: The arguments to scikit-learn's Decision Tree 15 | classifier. 16 | name (string): The name of this matcher (defaults to None). If the 17 | matcher name is None, the class automatically generates a string 18 | and assigns it as the name. 19 | Notes: 20 | For more details please see 21 | 22 | """ 23 | def __init__(self, *args, **kwargs): 24 | super(DTMatcher, self).__init__() 25 | # If the name is given, then pop it 26 | name = kwargs.pop('name', None) 27 | if name is None: 28 | # If the name of the matcher is give, then create one. 29 | # Currently, we use a constant string + a random number. 30 | self.name = 'DecisionTree' + '_' + get_ts() 31 | else: 32 | # Set the name of the matcher, with the given name. 33 | self.name = name 34 | # Set the classifier to the scikit-learn classifier. 35 | self.clf = DecisionTreeClassifier(*args, **kwargs) 36 | -------------------------------------------------------------------------------- /py_entitymatching/matcher/ensemblematcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains functions for ensembe matcher. 3 | Note: This is not going to be there in the first version of py_entitymatching. 4 | """ 5 | 6 | import pandas as pd 7 | import numpy as np 8 | import six 9 | 10 | from sklearn.base import BaseEstimator 11 | from sklearn.base import ClassifierMixin 12 | from sklearn.base import TransformerMixin 13 | from sklearn.base import clone 14 | from sklearn.pipeline import _name_estimators 15 | 16 | from py_entitymatching.matcher.mlmatcher import MLMatcher 17 | from py_entitymatching.matchercombiner.matchercombiner import MajorityVote, WeightedVote 18 | 19 | class EnsembleSKLearn(BaseEstimator, ClassifierMixin, TransformerMixin): 20 | def __init__(self, clfs, voting, weights=None, threshold=None): 21 | self.clfs = clfs 22 | self.named_clfs = {key:value for key,value in _name_estimators(clfs)} 23 | self.voting=voting 24 | if voting is 'weighted': 25 | self.combiner=WeightedVote(weights=weights, threshold=threshold) 26 | elif voting is 'majority': 27 | self.combiner=MajorityVote() 28 | else: 29 | raise AttributeError('Unrecognized voting method') 30 | 31 | def fit(self, X, y): 32 | self.clfs_ = [] 33 | for clf in self.clfs: 34 | fitted_clf = clone(clf).fit(X, y) 35 | self.clfs_.append(fitted_clf) 36 | return self 37 | 38 | def predict(self, X): 39 | return self._predict(X) 40 | 41 | def _predict(self, X): 42 | """ Collect results from clf.predict calls. 
""" 43 | predictions = np.asarray([clf.predict(X) for clf in self.clfs_]).T 44 | predicted_labels = self.combiner.combine(predictions) 45 | return predicted_labels 46 | 47 | def get_params(self, deep=True): 48 | """ Return estimator parameter names for GridSearch support""" 49 | if not deep: 50 | return super(EnsembleSKLearn, self).get_params(deep=False) 51 | else: 52 | out = self.named_clfs.copy() 53 | for name, step in six.iteritems(self.named_clfs): 54 | for key, value in six.iteritems(step.get_params(deep=True)): 55 | out['%s__%s' % (name, key)] = value 56 | return out 57 | 58 | class EnsembleMatcher(MLMatcher): 59 | def __init__(self, matchers, name=None, voting='weighted', weights=None, threshold=None): 60 | clfs = [m.clf for m in matchers] 61 | self.clf = EnsembleSKLearn(clfs, voting, weights, threshold) 62 | if name is None: 63 | names = [matcher.get_name() for matcher in matchers ] 64 | self.name = voting+':' 65 | self.name += ','.join(names) 66 | 67 | else: 68 | self.name = name 69 | -------------------------------------------------------------------------------- /py_entitymatching/matcher/logregmatcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains the functions for Logistic Regression classifier. 3 | """ 4 | from py_entitymatching.matcher.mlmatcher import MLMatcher 5 | from sklearn.linear_model import LogisticRegression 6 | from py_entitymatching.matcher.matcherutils import get_ts 7 | 8 | class LogRegMatcher(MLMatcher): 9 | """ 10 | Logistic Regression matcher. 11 | 12 | Args: 13 | *args,**kwargs: THe Arguments to scikit-learn's Logistic Regression 14 | classifier. 15 | name (string): The name of this matcher (defaults to None). If the 16 | matcher name is None, the class automatically generates a string 17 | and assigns it as the name. 18 | 19 | 20 | """ 21 | def __init__(self, *args, **kwargs): 22 | # If the name is given, then pop it 23 | name = kwargs.pop('name', None) 24 | if name is None: 25 | # If the name of the matcher is give, then create one. 26 | # Currently, we use a constant string + a random number. 27 | self.name = 'LogisticRegression'+ '_' + get_ts() 28 | else: 29 | # Set the name of the matcher, with the given name. 30 | self.name = name 31 | super(LogRegMatcher, self).__init__() 32 | # Set the classifier to the scikit-learn classifier. 33 | self.clf = LogisticRegression(*args, **kwargs) 34 | self.clf.classes_ = [0, 1] -------------------------------------------------------------------------------- /py_entitymatching/matcher/matcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains the definition for high level matcher class. 3 | """ 4 | class Matcher(object): 5 | pass 6 | 7 | -------------------------------------------------------------------------------- /py_entitymatching/matcher/nbmatcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains the functions for Naive Bayes classifier. 3 | """ 4 | 5 | from py_entitymatching.matcher.mlmatcher import MLMatcher 6 | from py_entitymatching.matcher.matcherutils import get_ts 7 | 8 | from sklearn.naive_bayes import GaussianNB 9 | 10 | class NBMatcher(MLMatcher): 11 | """ 12 | Naive Bayes matcher. 13 | 14 | Args: 15 | *args,**kwargs: The arguments to scikit-learn's Naive Bayes 16 | classifier. 17 | 18 | name (string): The name of this matcher (defaults to None). 
If the 19 | matcher name is None, the class automatically generates a string 20 | and assigns it as the name. 21 | 22 | 23 | """ 24 | def __init__(self, *args, **kwargs): 25 | # If the name is given, then pop it 26 | name = kwargs.pop('name', None) 27 | if name is None: 28 | # If the name of the matcher is not given, then create one. 29 | # Currently, we use a constant string + a timestamp-based suffix from get_ts. 30 | self.name = 'NaiveBayes' + '_' + get_ts() 31 | else: 32 | # Set the name of the matcher to the given name. 33 | self.name = name 34 | super(NBMatcher, self).__init__() 35 | # Set the classifier to the scikit-learn classifier. 36 | self.clf = GaussianNB(*args, **kwargs) -------------------------------------------------------------------------------- /py_entitymatching/matcher/rfmatcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains the functions for the Random Forest classifier. 3 | """ 4 | 5 | from py_entitymatching.matcher.mlmatcher import MLMatcher 6 | from py_entitymatching.matcher.matcherutils import get_ts 7 | 8 | from sklearn.ensemble import RandomForestClassifier 9 | 10 | class RFMatcher(MLMatcher): 11 | """ 12 | Random Forest matcher. 13 | 14 | Args: 15 | *args,**kwargs: The arguments to scikit-learn's Random Forest 16 | classifier. 17 | 18 | name (string): The name of this matcher (defaults to None). If the 19 | matcher name is None, the class automatically generates a string 20 | and assigns it as the name. 21 | 22 | 23 | """ 24 | def __init__(self, *args, **kwargs): 25 | super(RFMatcher, self).__init__() 26 | # If the name is given, then pop it 27 | name = kwargs.pop('name', None) 28 | if name is None: 29 | # If the name of the matcher is not given, then create one. 30 | # Currently, we use a constant string + a timestamp-based suffix from get_ts. 31 | self.name = 'RandomForest' + '_' + get_ts() 32 | else: 33 | # Set the name of the matcher to the given name. 34 | self.name = name 35 | # Set the classifier to the scikit-learn classifier. 36 | self.clf = RandomForestClassifier(*args, **kwargs) 37 | -------------------------------------------------------------------------------- /py_entitymatching/matcher/rulematcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains functions for the Rule Matcher. 3 | Note: This will not be included in the first version of py_entitymatching. 4 | """ 5 | from py_entitymatching.matcher.matcher import Matcher 6 | 7 | class RuleMatcher(Matcher): 8 | pass 9 | -------------------------------------------------------------------------------- /py_entitymatching/matcher/svmmatcher.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains the functions for the SVM classifier. 3 | 4 | """ 5 | from py_entitymatching.matcher.mlmatcher import MLMatcher 6 | from py_entitymatching.matcher.matcherutils import get_ts 7 | from sklearn.svm import SVC 8 | 9 | 10 | class SVMMatcher(MLMatcher): 11 | """ 12 | SVM matcher. 13 | 14 | Args: 15 | *args,**kwargs: The arguments to scikit-learn's SVM 16 | classifier. 17 | name (string): The name of this matcher (defaults to None). If the 18 | matcher name is None, the class automatically generates a string 19 | and assigns it as the name.
20 | 21 | 22 | """ 23 | def __init__(self, *args, **kwargs): 24 | super(SVMMatcher, self).__init__() 25 | # If the name is given, then pop it 26 | name = kwargs.pop('name', None) 27 | if name is None: 28 | # If the name of the matcher is give, then create one. 29 | # Currently, we use a constant string + a random number. 30 | self.name = 'SVM'+ '_' + get_ts() 31 | else: 32 | # Set the name of the matcher, with the given name. 33 | self.name = name 34 | # Set the classifier to the scikit-learn classifier. 35 | self.clf = SVC(*args, **kwargs) -------------------------------------------------------------------------------- /py_entitymatching/matcher/xgboostmatcher.py: -------------------------------------------------------------------------------- 1 | from py_entitymatching.matcher.mlmatcher import MLMatcher 2 | from py_entitymatching.matcher.matcherutils import get_ts 3 | # from sklearn.svm import SVC 4 | try: 5 | from xgboost.sklearn import XGBClassifier 6 | except ImportError: 7 | raise ImportError('Check if xgboost library is installed. You can install xgboost ' 8 | 'by following the instructions at http://xgboost.readthedocs.io/en/latest/build.html') 9 | 10 | 11 | class XGBoostMatcher(MLMatcher): 12 | """ 13 | XGBoost matcher. 14 | 15 | Args: 16 | *args,**kwargs: The arguments to XGBoost 17 | classifier. 18 | name (string): The name of this matcher (defaults to None). If the 19 | matcher name is None, the class automatically generates a string 20 | and assigns it as the name. 21 | 22 | 23 | """ 24 | def __init__(self, *args, **kwargs): 25 | super(XGBoostMatcher, self).__init__() 26 | # If the name is given, then pop it 27 | name = kwargs.pop('name', None) 28 | if name is None: 29 | # If the name of the matcher is give, then create one. 30 | # Currently, we use a constant string + a random number. 31 | self.name = 'xgboost'+ '_' + get_ts() 32 | else: 33 | # Set the name of the matcher, with the given name. 34 | self.name = name 35 | # Set the classifier to the scikit-learn classifier. 36 | try: 37 | from xgboost.sklearn import XGBClassifier 38 | except ImportError: 39 | raise ImportError( 40 | 'Check if xgboost library is installed. You can install xgboost ' 41 | 'by following the instructions at http://xgboost.readthedocs.io/en/latest/build.html') 42 | self.clf = XGBClassifier(*args, **kwargs) 43 | 44 | -------------------------------------------------------------------------------- /py_entitymatching/matchercombiner/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/matchercombiner/__init__.py -------------------------------------------------------------------------------- /py_entitymatching/matcherselector/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/matcherselector/__init__.py -------------------------------------------------------------------------------- /py_entitymatching/matcherselector/mlmatchercombinerselection.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains functions for ML-matcher combiner selection. 3 | Note: This is not going to be there for the first release of py_entitymatching. 
4 | """ 5 | 6 | import itertools 7 | import six 8 | 9 | from py_entitymatching.matcherselector.mlmatcherselection import select_matcher 10 | from py_entitymatching.matcher.ensemblematcher import EnsembleMatcher 11 | 12 | def selector_matcher_combiner(matchers, combiners, x=None, y=None, table=None, exclude_attrs=None, target_attr=None, 13 | weights=None, threshold=None, k=5): 14 | if not isinstance(matchers, list): 15 | matchers = [matchers] 16 | if not isinstance(combiners, list): 17 | combiners = [combiners] 18 | matcher_list = get_matcher_list(matchers, combiners, weights, threshold) 19 | return select_matcher(matcher_list, x=x, y=y, table=table, exclude_attrs=exclude_attrs, target_attr=target_attr, 20 | k=k) 21 | def get_matcher_list(matchers, combiners, weights, threshold): 22 | ensemble_len = range(2, len(matchers) + 1) 23 | matcher_list = [] 24 | matcher_list.extend(matchers) 25 | for l in ensemble_len: 26 | iter_combns = itertools.combinations(six.moves.xrange(0, 27 | len(matchers)), l) 28 | for ic in iter_combns: 29 | for c in combiners: 30 | m = [matchers[i] for i in ic] 31 | if c is 'Weighted': 32 | em = EnsembleMatcher(m, voting=c, weights=weights, threshold=threshold) 33 | else: 34 | em = EnsembleMatcher(m, voting=c) 35 | matcher_list.append(em) 36 | return matcher_list -------------------------------------------------------------------------------- /py_entitymatching/sampler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/sampler/__init__.py -------------------------------------------------------------------------------- /py_entitymatching/tests/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | #sys.path.append('/scratch/pradap/python-work/py_entitymatching') 3 | 4 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/A.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | a1,Kevin Smith,1989,30.0,"607 From St, San Francisco",94107 3 | a2,Michael Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122 4 | a3,William Bridge,1986,32.0,"3131 Webster St, San Francisco",94107 5 | a4,Binto George,1987,32.5,"423 Powell St, San Francisco",94122 6 | a5,Alphonse Kemper,1984,35.0,"1702 Post Street, San Francisco",94122 7 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/A.metadata: -------------------------------------------------------------------------------- 1 | #key=ID 2 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/B.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | b1,Mark Levene,1987,29.5,"108 Clement St, San Francisco",94107 3 | b2,Bill Bridge,1986,32,"3131 Webster St, San Francisco",94107 4 | b3,Mike Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122 5 | b4,Joseph Kuan,1982,26,"108 South Park, San Francisco",94122 6 | b5,Alfons Kemper,1984,35,"170 Post St, Apt 4, San Francisco",94122 7 | b6,Michael Brodie,1987,32.5,"133 Clement Street, San Francisco",94107 -------------------------------------------------------------------------------- 
/py_entitymatching/tests/test_datasets/C.csv: -------------------------------------------------------------------------------- 1 | _id,ltable_ID,rtable_ID,ltable_name,ltable_zipcode,ltable_birth_year,rtable_name,rtable_zipcode,rtable_birth_year 2 | 0,a1,b1,Kevin Smith,94107,1989,Mark Levene,94107,1987 3 | 1,a1,b2,Kevin Smith,94107,1989,Bill Bridge,94107,1986 4 | 2,a1,b6,Kevin Smith,94107,1989,Michael Brodie,94107,1987 5 | 6,a2,b3,Michael Franklin,94122,1988,Mike Franklin,94122,1988 6 | 7,a2,b4,Michael Franklin,94122,1988,Joseph Kuan,94122,1982 7 | 8,a2,b5,Michael Franklin,94122,1988,Alfons Kemper,94122,1984 8 | 3,a3,b1,William Bridge,94107,1986,Mark Levene,94107,1987 9 | 4,a3,b2,William Bridge,94107,1986,Bill Bridge,94107,1986 10 | 5,a3,b6,William Bridge,94107,1986,Michael Brodie,94107,1987 11 | 9,a4,b3,Binto George,94122,1987,Mike Franklin,94122,1988 12 | 10,a4,b4,Binto George,94122,1987,Joseph Kuan,94122,1982 13 | 11,a4,b5,Binto George,94122,1987,Alfons Kemper,94122,1984 14 | 12,a5,b3,Alphonse Kemper,94122,1984,Mike Franklin,94122,1988 15 | 13,a5,b4,Alphonse Kemper,94122,1984,Joseph Kuan,94122,1982 16 | 14,a5,b5,Alphonse Kemper,94122,1984,Alfons Kemper,94122,1984 17 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/C.metadata: -------------------------------------------------------------------------------- 1 | #fk_ltable=ltable_ID 2 | #ltable=POINTER 3 | #key=_id 4 | #fk_rtable=rtable_ID 5 | #rtable=POINTER 6 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/C1.csv: -------------------------------------------------------------------------------- 1 | _id,ltable_ID,rtable_ID,ltable_name,ltable_zipcode,ltable_birth_year,rtable_name,rtable_zipcode,rtable_birth_year 2 | 0,a1,b1,Kevin Smith,94107,1989,Mark Levene,94107,1987 3 | 1,a1,b2,Kevin Smith,94107,1989,Bill Bridge,94107,1986 4 | 2,a1,b6,Kevin Smith,94107,1989,Michael Brodie,94107,1987 5 | 6,a2,b3,Michael Franklin,94122,1988,Mike Franklin,94122,1988 6 | 7,a2,b4,Michael Franklin,94122,1988,Joseph Kuan,94122,1982 7 | 8,a2,b5,Michael Franklin,94122,1988,Alfons Kemper,94122,1984 8 | 3,a3,b1,William Bridge,94107,1986,Mark Levene,94107,1987 9 | 4,a3,b2,William Bridge,94107,1986,Bill Bridge,94107,1986 10 | 5,a3,b6,William Bridge,94107,1986,Michael Brodie,94107,1987 11 | 9,a4,b3,Binto George,94122,1987,Mike Franklin,94122,1988 12 | 10,a4,b4,Binto George,94122,1987,Joseph Kuan,94122,1982 13 | 11,a4,b5,Binto George,94122,1987,Alfons Kemper,94122,1984 14 | 12,a5,b3,Alphonse Kemper,94122,1984,Mike Franklin,94122,1988 15 | 13,a5,b4,Alphonse Kemper,94122,1984,Joseph Kuan,94122,1982 -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/C1.metadata: -------------------------------------------------------------------------------- 1 | #key=_id -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/D.csv: -------------------------------------------------------------------------------- 1 | _id,name,birth_year,hourly_wage,address,zipcode,label 2 | 1,Kevin Smith ,1989 ,30.0,"607 From St, San Francisco" ,94107 ,Yes 3 | 2,Michael Franklin,1988 ,27.5,"1652 Stockton St, San Francisco",94122 ,Not-Matched 4 | 3,William Bridge ,1986 ,32.0 ,"3131 Webster St, San Francisco" ,94107 ,Yes 5 | 4,Binto George ,1987 ,32.5 ,"423 Powell St, San Francisco" ,94122 ,Not-Matched 6 | 5,Alphonse Kemper ,1984 
,35.0 ,"1702 Post Street, San Francisco",94122 ,Yes 7 | 6,Kevin Smith ,1989 ,30.0 ,"607 From St, San Francisco" ,94107 ,Not-Matched 8 | 7,Michael Franklin,1988 ,27.5 ,"1652 Stockton St, San Francisco",94122 ,Yes 9 | 8,William Bridge ,1986 ,32.0 ,"3131 Webster St, San Francisco" ,94107 ,Not-Sure 10 | 9,Binto George ,1987 ,32.5 ,"423 Powell St, San Francisco" ,94122 ,Yes 11 | 10,Alphonse Kemper ,1984 ,35.0 ,"1702 Post Street, San Francisco",94122 ,Not-Labeled 12 | 11,Kevin Smith ,1989 ,30.0 ,"607 From St, San Francisco" ,94107 ,Yes 13 | 12,Michael Franklin,1988 ,27.5 ,"1652 Stockton St, San Francisco",94122 ,Not-Sure 14 | 13,William Bridge ,1986 ,32.0 ,"3131 Webster St, San Francisco" ,94107 ,Yes 15 | 14,Binto George ,1987 ,32.5 ,"423 Powell St, San Francisco" ,94122 ,Not-Sure -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/D.metadata: -------------------------------------------------------------------------------- 1 | #key=_id -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blocker/table_A_wi_missing_vals.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | a1,Kevin Smith,1989,30,"607 From St, San Francisco", 3 | a2,Michael Franklin,,27.5,"1652 Stockton St, San Francisco",94122 4 | a3,William Bridge,1986,32,"3131 Webster St, San Francisco",94107 5 | a4,,1987,32.5,"423 Powell St, San Francisco", 6 | a5,Alphonse Kemper,1984,35,,94122 7 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blocker/table_A_wi_missing_vals.metadata: -------------------------------------------------------------------------------- 1 | #key=ID 2 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blocker/table_B_wi_missing_vals.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | b1,Mark Levene,1987,29.5,"108 Clement St, San Francisco",94107 3 | b2,Bill Bridge,1986,32,"3131 Webster St, San Francisco", 4 | b3,Mike Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122 5 | b4,,1982,26,"108 South Park, San Francisco", 6 | b5,Alfons Kemper,1984,35,"170 Post St, Apt 4, San Francisco",94122 7 | b6,Michael Brodie,1987,32.5,,94107 8 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blocker/table_B_wi_missing_vals.metadata: -------------------------------------------------------------------------------- 1 | #key=ID 2 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C1.csv: -------------------------------------------------------------------------------- 1 | _id,ltable_ID,rtable_ID,ltable_name,ltable_zipcode,ltable_birth_year,rtable_name,rtable_zipcode,rtable_birth_year 2 | 3,a3,b1,William Bridge,94107,1986,Mark Levene,94107,1987 3 | 4,a3,b2,William Bridge,94107,1986,Bill Bridge,94107,1986 4 | 5,a3,b6,William Bridge,94107,1986,Michael Brodie,94107,1987 5 | 6,a2,b3,Michael Franklin,94122,1988,Mike Franklin,94122,1988 6 | 7,a2,b4,Michael Franklin,94122,1988,Joseph Kuan,94122,1982 7 | 8,a2,b5,Michael Franklin,94122,1988,Alfons Kemper,94122,1984 8 | 9,a4,b3,Binto George,94122,1987,Mike Franklin,94122,1988 9 | 
10,a4,b4,Binto George,94122,1987,Joseph Kuan,94122,1982 10 | 11,a4,b5,Binto George,94122,1987,Alfons Kemper,94122,1984 11 | 12,a5,b3,Alphonse Kemper,94122,1984,Mike Franklin,94122,1988 12 | 13,a5,b4,Alphonse Kemper,94122,1984,Joseph Kuan,94122,1982 13 | 14,a5,b5,Alphonse Kemper,94122,1984,Alfons Kemper,94122,1984 14 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C1.metadata: -------------------------------------------------------------------------------- 1 | #ltable=POINTER 2 | #rtable=POINTER 3 | #fk_rtable=rtable_ID 4 | #fk_ltable=ltable_ID 5 | #key=_id 6 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C1_ex_1.csv: -------------------------------------------------------------------------------- 1 | _id,ltable_ID,rtable_ID,ltable_name,ltable_address,rtable_name,rtable_address 2 | 0,a1,b1,Kevin Smith,"607 From St, San Francisco",Mark Levene,"108 Clement St, San Francisco" 3 | 1,a1,b2,Kevin Smith,"607 From St, San Francisco",Bill Bridge,"3131 Webster St, San Francisco" 4 | 2,a1,b6,Kevin Smith,"607 From St, San Francisco",Michael Brodie,"133 Clement Street, San Francisco" 5 | 6,a2,b3,Michael Franklin,"1652 Stockton St, San Francisco",Mike Franklin,"1652 Stockton St, San Francisco" 6 | 7,a2,b4,Michael Franklin,"1652 Stockton St, San Francisco",Joseph Kuan,"108 South Park, San Francisco" 7 | 8,a2,b5,Michael Franklin,"1652 Stockton St, San Francisco",Alfons Kemper,"170 Post St, Apt 4, San Francisco" 8 | 3,a3,b1,William Bridge,"3131 Webster St, San Francisco",Mark Levene,"108 Clement St, San Francisco" 9 | 4,a3,b2,William Bridge,"3131 Webster St, San Francisco",Bill Bridge,"3131 Webster St, San Francisco" 10 | 5,a3,b6,William Bridge,"3131 Webster St, San Francisco",Michael Brodie,"133 Clement Street, San Francisco" 11 | 9,a4,b3,Binto George,"423 Powell St, San Francisco",Mike Franklin,"1652 Stockton St, San Francisco" 12 | 10,a4,b4,Binto George,"423 Powell St, San Francisco",Joseph Kuan,"108 South Park, San Francisco" 13 | 11,a4,b5,Binto George,"423 Powell St, San Francisco",Alfons Kemper,"170 Post St, Apt 4, San Francisco" 14 | 12,a5,b3,Alphonse Kemper,"1702 Post Street, San Francisco",Mike Franklin,"1652 Stockton St, San Francisco" 15 | 13,a5,b4,Alphonse Kemper,"1702 Post Street, San Francisco",Joseph Kuan,"108 South Park, San Francisco" 16 | 14,a5,b5,Alphonse Kemper,"1702 Post Street, San Francisco",Alfons Kemper,"170 Post St, Apt 4, San Francisco" 17 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C1_ex_1.metadata: -------------------------------------------------------------------------------- 1 | #fk_ltable=ltable_ID 2 | #ltable=POINTER 3 | #key=_id 4 | #fk_rtable=rtable_ID 5 | #rtable=POINTER 6 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C2.csv: -------------------------------------------------------------------------------- 1 | _id,ltable_ID,rtable_ID,ltable_name,ltable_zipcode,ltable_birth_year,rtable_name,rtable_zipcode,rtable_birth_year 2 | 0,a1,b1,Kevin Smith,94107,1989,Mark Levene,94107,1987 3 | 1,a1,b2,Kevin Smith,94107,1989,Bill Bridge,94107,1986 4 | 2,a1,b6,Kevin Smith,94107,1989,Michael Brodie,94107,1987 5 | 6,a2,b3,Michael Franklin,94122,1988,Mike Franklin,94122,1988 6 | 7,a2,b4,Michael Franklin,94122,1988,Joseph 
Kuan,94122,1982 7 | 8,a2,b5,Michael Franklin,94122,1988,Alfons Kemper,94122,1984 8 | 9,a4,b3,Binto George,94122,1987,Mike Franklin,94122,1988 9 | 10,a4,b4,Binto George,94122,1987,Joseph Kuan,94122,1982 10 | 11,a4,b5,Binto George,94122,1987,Alfons Kemper,94122,1984 11 | 12,a5,b3,Alphonse Kemper,94122,1984,Mike Franklin,94122,1988 12 | 13,a5,b4,Alphonse Kemper,94122,1984,Joseph Kuan,94122,1982 13 | 14,a5,b5,Alphonse Kemper,94122,1984,Alfons Kemper,94122,1984 14 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C2.metadata: -------------------------------------------------------------------------------- 1 | #ltable=POINTER 2 | #rtable=POINTER 3 | #fk_rtable=rtable_ID 4 | #fk_ltable=ltable_ID 5 | #key=_id 6 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C2_ex_1.csv: -------------------------------------------------------------------------------- 1 | _id,ltable_ID,rtable_ID,ltable_birth_year,ltable_zipcode,rtable_birth_year,rtable_zipcode 2 | 0,a2,b3,1988,94122,1988,94122 3 | 1,a3,b2,1986,94107,1986,94107 4 | 2,a4,b1,1987,94122,1987,94107 5 | 3,a4,b6,1987,94122,1987,94107 6 | 4,a5,b5,1984,94122,1984,94122 7 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C2_ex_1.metadata: -------------------------------------------------------------------------------- 1 | #ltable=POINTER 2 | #rtable=POINTER 3 | #fk_rtable=rtable_ID 4 | #fk_ltable=ltable_ID 5 | #key=_id 6 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C3.csv: -------------------------------------------------------------------------------- 1 | _id,ltable_ID,rtable_ID,ltable_name,ltable_zipcode,ltable_birth_year,rtable_name,rtable_zipcode,rtable_birth_year 2 | 0,a1,b1,Kevin Smith,94107,1989,Mark Levene,94107,1987 3 | 1,a1,b2,Kevin Smith,94107,1989,Bill Bridge,94107,1986 4 | 2,a1,b6,Kevin Smith,94107,1989,Michael Brodie,94107,1987 5 | 3,a3,b1,William Bridge,94107,1986,Mark Levene,94107,1987 6 | 4,a3,b2,William Bridge,94107,1986,Bill Bridge,94107,1986 7 | 5,a3,b6,William Bridge,94107,1986,Michael Brodie,94107,1987 8 | 9,a4,b3,Binto George,94122,1987,Mike Franklin,94122,1988 9 | 10,a4,b4,Binto George,94122,1987,Joseph Kuan,94122,1982 10 | 11,a4,b5,Binto George,94122,1987,Alfons Kemper,94122,1984 11 | 12,a5,b3,Alphonse Kemper,94122,1984,Mike Franklin,94122,1988 12 | 13,a5,b4,Alphonse Kemper,94122,1984,Joseph Kuan,94122,1982 13 | 14,a5,b5,Alphonse Kemper,94122,1984,Alfons Kemper,94122,1984 14 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C3.metadata: -------------------------------------------------------------------------------- 1 | #ltable=POINTER 2 | #rtable=POINTER 3 | #fk_rtable=rtable_ID 4 | #fk_ltable=ltable_ID 5 | #key=_id 6 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C3_ex_2.csv: -------------------------------------------------------------------------------- 1 | _id,ltable_ID,rtable_ID,ltable_birth_year,ltable_zipcode,rtable_birth_year,rtable_zipcode 2 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C3_ex_2.metadata: 
-------------------------------------------------------------------------------- 1 | #ltable=POINTER 2 | #rtable=POINTER 3 | #fk_rtable=rtable_ID 4 | #fk_ltable=ltable_ID 5 | #key=_id 6 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C4_ex_1.csv: -------------------------------------------------------------------------------- 1 | _id,l_ID,r_ID,l_name,l_address,r_name,r_address 2 | 0,a1,b1,Kevin Smith,"607 From St, San Francisco",Mark Levene,"108 Clement St, San Francisco" 3 | 1,a1,b2,Kevin Smith,"607 From St, San Francisco",Bill Bridge,"3131 Webster St, San Francisco" 4 | 2,a1,b6,Kevin Smith,"607 From St, San Francisco",Michael Brodie,"133 Clement Street, San Francisco" 5 | 3,a3,b1,William Bridge,"3131 Webster St, San Francisco",Mark Levene,"108 Clement St, San Francisco" 6 | 4,a3,b2,William Bridge,"3131 Webster St, San Francisco",Bill Bridge,"3131 Webster St, San Francisco" 7 | 5,a3,b6,William Bridge,"3131 Webster St, San Francisco",Michael Brodie,"133 Clement Street, San Francisco" 8 | 6,a2,b3,Michael Franklin,"1652 Stockton St, San Francisco",Mike Franklin,"1652 Stockton St, San Francisco" 9 | 7,a2,b4,Michael Franklin,"1652 Stockton St, San Francisco",Joseph Kuan,"108 South Park, San Francisco" 10 | 8,a2,b5,Michael Franklin,"1652 Stockton St, San Francisco",Alfons Kemper,"170 Post St, Apt 4, San Francisco" 11 | 9,a4,b3,Binto George,"423 Powell St, San Francisco",Mike Franklin,"1652 Stockton St, San Francisco" 12 | 10,a4,b4,Binto George,"423 Powell St, San Francisco",Joseph Kuan,"108 South Park, San Francisco" 13 | 11,a4,b5,Binto George,"423 Powell St, San Francisco",Alfons Kemper,"170 Post St, Apt 4, San Francisco" 14 | 12,a5,b3,Alphonse Kemper,"1702 Post Street, San Francisco",Mike Franklin,"1652 Stockton St, San Francisco" 15 | 13,a5,b4,Alphonse Kemper,"1702 Post Street, San Francisco",Joseph Kuan,"108 South Park, San Francisco" 16 | 14,a5,b5,Alphonse Kemper,"1702 Post Street, San Francisco",Alfons Kemper,"170 Post St, Apt 4, San Francisco" 17 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C4_ex_1.metadata: -------------------------------------------------------------------------------- 1 | #ltable=POINTER 2 | #rtable=POINTER 3 | #fk_rtable=r_ID 4 | #fk_ltable=l_ID 5 | #key=_id 6 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C4_ex_2.csv: -------------------------------------------------------------------------------- 1 | _id,l_ID,r_ID,l_birth_year,l_zipcode,r_birth_year,r_zipcode 2 | 0,a2,b3,1988,94122,1988,94122 3 | 1,a3,b2,1986,94107,1986,94107 4 | 2,a4,b1,1987,94122,1987,94107 5 | 3,a4,b6,1987,94122,1987,94107 6 | 4,a5,b5,1984,94122,1984,94122 7 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C4_ex_2.metadata: -------------------------------------------------------------------------------- 1 | #ltable=POINTER 2 | #rtable=POINTER 3 | #fk_rtable=r_ID 4 | #fk_ltable=l_ID 5 | #key=_id 6 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C_ex_1.csv: -------------------------------------------------------------------------------- 1 | 
_id,ltable_ID,rtable_ID,ltable_name,ltable_address,ltable_birth_year,ltable_zipcode,rtable_name,rtable_address,rtable_birth_year,rtable_zipcode 2 | 0,a1,b1,Kevin Smith,"607 From St, San Francisco",1989,94107,Mark Levene,"108 Clement St, San Francisco",1987,94107 3 | 1,a1,b2,Kevin Smith,"607 From St, San Francisco",1989,94107,Bill Bridge,"3131 Webster St, San Francisco",1986,94107 4 | 2,a1,b6,Kevin Smith,"607 From St, San Francisco",1989,94107,Michael Brodie,"133 Clement Street, San Francisco",1987,94107 5 | 3,a2,b3,Michael Franklin,"1652 Stockton St, San Francisco",1988,94122,Mike Franklin,"1652 Stockton St, San Francisco",1988,94122 6 | 4,a2,b4,Michael Franklin,"1652 Stockton St, San Francisco",1988,94122,Joseph Kuan,"108 South Park, San Francisco",1982,94122 7 | 5,a2,b5,Michael Franklin,"1652 Stockton St, San Francisco",1988,94122,Alfons Kemper,"170 Post St, Apt 4, San Francisco",1984,94122 8 | 6,a3,b1,William Bridge,"3131 Webster St, San Francisco",1986,94107,Mark Levene,"108 Clement St, San Francisco",1987,94107 9 | 7,a3,b2,William Bridge,"3131 Webster St, San Francisco",1986,94107,Bill Bridge,"3131 Webster St, San Francisco",1986,94107 10 | 8,a3,b6,William Bridge,"3131 Webster St, San Francisco",1986,94107,Michael Brodie,"133 Clement Street, San Francisco",1987,94107 11 | 9,a4,b1,Binto George,"423 Powell St, San Francisco",1987,94122,Mark Levene,"108 Clement St, San Francisco",1987,94107 12 | 10,a4,b3,Binto George,"423 Powell St, San Francisco",1987,94122,Mike Franklin,"1652 Stockton St, San Francisco",1988,94122 13 | 11,a4,b4,Binto George,"423 Powell St, San Francisco",1987,94122,Joseph Kuan,"108 South Park, San Francisco",1982,94122 14 | 12,a4,b5,Binto George,"423 Powell St, San Francisco",1987,94122,Alfons Kemper,"170 Post St, Apt 4, San Francisco",1984,94122 15 | 13,a4,b6,Binto George,"423 Powell St, San Francisco",1987,94122,Michael Brodie,"133 Clement Street, San Francisco",1987,94107 16 | 14,a5,b3,Alphonse Kemper,"1702 Post Street, San Francisco",1984,94122,Mike Franklin,"1652 Stockton St, San Francisco",1988,94122 17 | 15,a5,b4,Alphonse Kemper,"1702 Post Street, San Francisco",1984,94122,Joseph Kuan,"108 South Park, San Francisco",1982,94122 18 | 16,a5,b5,Alphonse Kemper,"1702 Post Street, San Francisco",1984,94122,Alfons Kemper,"170 Post St, Apt 4, San Francisco",1984,94122 19 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C_ex_1.metadata: -------------------------------------------------------------------------------- 1 | #ltable=POINTER 2 | #rtable=POINTER 3 | #fk_rtable=rtable_ID 4 | #fk_ltable=ltable_ID 5 | #key=_id 6 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C_ex_2.csv: -------------------------------------------------------------------------------- 1 | _id,ltable_ID,rtable_ID,ltable_name,ltable_address,ltable_birth_year,ltable_zipcode,rtable_name,rtable_address,rtable_birth_year,rtable_zipcode 2 | 0,a1,b1,Kevin Smith,"607 From St, San Francisco",1989,94107,Mark Levene,"108 Clement St, San Francisco",1987,94107 3 | 1,a1,b2,Kevin Smith,"607 From St, San Francisco",1989,94107,Bill Bridge,"3131 Webster St, San Francisco",1986,94107 4 | 2,a1,b6,Kevin Smith,"607 From St, San Francisco",1989,94107,Michael Brodie,"133 Clement Street, San Francisco",1987,94107 5 | 3,a2,b3,Michael Franklin,"1652 Stockton St, San Francisco",1988,94122,Mike Franklin,"1652 Stockton St, San Francisco",1988,94122 6 | 
4,a2,b4,Michael Franklin,"1652 Stockton St, San Francisco",1988,94122,Joseph Kuan,"108 South Park, San Francisco",1982,94122 7 | 5,a2,b5,Michael Franklin,"1652 Stockton St, San Francisco",1988,94122,Alfons Kemper,"170 Post St, Apt 4, San Francisco",1984,94122 8 | 6,a3,b1,William Bridge,"3131 Webster St, San Francisco",1986,94107,Mark Levene,"108 Clement St, San Francisco",1987,94107 9 | 7,a3,b2,William Bridge,"3131 Webster St, San Francisco",1986,94107,Bill Bridge,"3131 Webster St, San Francisco",1986,94107 10 | 8,a3,b6,William Bridge,"3131 Webster St, San Francisco",1986,94107,Michael Brodie,"133 Clement Street, San Francisco",1987,94107 11 | 9,a4,b3,Binto George,"423 Powell St, San Francisco",1987,94122,Mike Franklin,"1652 Stockton St, San Francisco",1988,94122 12 | 10,a4,b4,Binto George,"423 Powell St, San Francisco",1987,94122,Joseph Kuan,"108 South Park, San Francisco",1982,94122 13 | 11,a4,b5,Binto George,"423 Powell St, San Francisco",1987,94122,Alfons Kemper,"170 Post St, Apt 4, San Francisco",1984,94122 14 | 12,a5,b3,Alphonse Kemper,"1702 Post Street, San Francisco",1984,94122,Mike Franklin,"1652 Stockton St, San Francisco",1988,94122 15 | 13,a5,b4,Alphonse Kemper,"1702 Post Street, San Francisco",1984,94122,Joseph Kuan,"108 South Park, San Francisco",1982,94122 16 | 14,a5,b5,Alphonse Kemper,"1702 Post Street, San Francisco",1984,94122,Alfons Kemper,"170 Post St, Apt 4, San Francisco",1984,94122 17 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C_ex_2.metadata: -------------------------------------------------------------------------------- 1 | #ltable=POINTER 2 | #rtable=POINTER 3 | #fk_rtable=rtable_ID 4 | #fk_ltable=ltable_ID 5 | #key=_id 6 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C_ex_4.csv: -------------------------------------------------------------------------------- 1 | _id,l_ID,r_ID,l_name,l_address,l_birth_year,l_zipcode,r_name,r_address,r_birth_year,r_zipcode 2 | 0,a1,b1,Kevin Smith,"607 From St, San Francisco",1989,94107,Mark Levene,"108 Clement St, San Francisco",1987,94107 3 | 1,a1,b2,Kevin Smith,"607 From St, San Francisco",1989,94107,Bill Bridge,"3131 Webster St, San Francisco",1986,94107 4 | 2,a1,b6,Kevin Smith,"607 From St, San Francisco",1989,94107,Michael Brodie,"133 Clement Street, San Francisco",1987,94107 5 | 3,a2,b3,Michael Franklin,"1652 Stockton St, San Francisco",1988,94122,Mike Franklin,"1652 Stockton St, San Francisco",1988,94122 6 | 4,a2,b4,Michael Franklin,"1652 Stockton St, San Francisco",1988,94122,Joseph Kuan,"108 South Park, San Francisco",1982,94122 7 | 5,a2,b5,Michael Franklin,"1652 Stockton St, San Francisco",1988,94122,Alfons Kemper,"170 Post St, Apt 4, San Francisco",1984,94122 8 | 6,a3,b1,William Bridge,"3131 Webster St, San Francisco",1986,94107,Mark Levene,"108 Clement St, San Francisco",1987,94107 9 | 7,a3,b2,William Bridge,"3131 Webster St, San Francisco",1986,94107,Bill Bridge,"3131 Webster St, San Francisco",1986,94107 10 | 8,a3,b6,William Bridge,"3131 Webster St, San Francisco",1986,94107,Michael Brodie,"133 Clement Street, San Francisco",1987,94107 11 | 9,a4,b1,Binto George,"423 Powell St, San Francisco",1987,94122,Mark Levene,"108 Clement St, San Francisco",1987,94107 12 | 10,a4,b3,Binto George,"423 Powell St, San Francisco",1987,94122,Mike Franklin,"1652 Stockton St, San Francisco",1988,94122 13 | 11,a4,b4,Binto George,"423 Powell St, San 
Francisco",1987,94122,Joseph Kuan,"108 South Park, San Francisco",1982,94122 14 | 12,a4,b5,Binto George,"423 Powell St, San Francisco",1987,94122,Alfons Kemper,"170 Post St, Apt 4, San Francisco",1984,94122 15 | 13,a4,b6,Binto George,"423 Powell St, San Francisco",1987,94122,Michael Brodie,"133 Clement Street, San Francisco",1987,94107 16 | 14,a5,b3,Alphonse Kemper,"1702 Post Street, San Francisco",1984,94122,Mike Franklin,"1652 Stockton St, San Francisco",1988,94122 17 | 15,a5,b4,Alphonse Kemper,"1702 Post Street, San Francisco",1984,94122,Joseph Kuan,"108 South Park, San Francisco",1982,94122 18 | 16,a5,b5,Alphonse Kemper,"1702 Post Street, San Francisco",1984,94122,Alfons Kemper,"170 Post St, Apt 4, San Francisco",1984,94122 19 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/blockercombiner/C_ex_4.metadata: -------------------------------------------------------------------------------- 1 | #ltable=POINTER 2 | #rtable=POINTER 3 | #fk_rtable=r_ID 4 | #fk_ltable=l_ID 5 | #key=_id 6 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/catalog/A.metadata: -------------------------------------------------------------------------------- 1 | #key=ID 2 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/catalog/A_dupid.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | a1,Kevin Smith,1989,30,"607 From St, San Francisco",94107 3 | a2,Michael Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122 4 | a3,William Bridge,1986,32,"3131 Webster St, San Francisco",94107 5 | a4,Binto George,1987,32.5,"423 Powell St, San Francisco",94122 6 | a5,Alphonse Kemper,1984,35,"1702 Post Street, San Francisco",94122 7 | a5,Alphonse Kemper,1984,35,"1702 Post Street, San Francisco",94122 -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/catalog/A_inv_fk.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | a1,Kevin Smith,1989,30,"607 From St, San Francisco",94107 3 | a2,Michael Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122 4 | a3,William Bridge,1986,32,"3131 Webster St, San Francisco",94107 5 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/catalog/A_mvals.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | a1,Kevin Smith,1989,30,"607 From St, San Francisco",94107 3 | a2,Michael Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122 4 | a3,William Bridge,1986,32,"3131 Webster St, San Francisco",94107 5 | a4,Binto George,1987,32.5,"423 Powell St, San Francisco",94122 6 | ,Alphonse Kemper,1984,35,"1702 Post Street, San Francisco",94122 -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/debugblocker/test_debugblocker_13.metadata: -------------------------------------------------------------------------------- 1 | #ltable=POINTER 2 | #rtable=POINTER 3 | #fk_rtable=rtable_ID 4 | #fk_ltable=ltable_ID 5 | #key=_id 6 | -------------------------------------------------------------------------------- 
/py_entitymatching/tests/test_datasets/debugblocker/test_debugblocker_13_out.csv: -------------------------------------------------------------------------------- 1 | _id,ltable_ID,rtable_ID,ltable_name,ltable_address,rtable_name,rtable_address 2 | 0,a2,b2,Michael Franklin,"1652 Stockton St, San Francisco",Bill Bridge,"3131 Webster St, San Francisco" 3 | 1,a5,b6,Alphonse Kemper,"1702 Post Street, San Francisco",Michael Brodie,"133 Clement Street, San Francisco" 4 | 2,a2,b6,Michael Franklin,"1652 Stockton St, San Francisco",Michael Brodie,"133 Clement Street, San Francisco" 5 | 3,a4,b2,Binto George,"423 Powell St, San Francisco",Bill Bridge,"3131 Webster St, San Francisco" 6 | 4,a1,b3,Kevin Smith,"607 From St, San Francisco",Mike Franklin,"1652 Stockton St, San Francisco" 7 | 5,a4,b1,Binto George,"423 Powell St, San Francisco",Mark Levene,"108 Clement St, San Francisco" 8 | 6,a2,b1,Michael Franklin,"1652 Stockton St, San Francisco",Mark Levene,"108 Clement St, San Francisco" 9 | 7,a3,b3,William Bridge,"3131 Webster St, San Francisco",Mike Franklin,"1652 Stockton St, San Francisco" 10 | 8,a1,b5,Kevin Smith,"607 From St, San Francisco",Alfons Kemper,"170 Post St, Apt 4, San Francisco" 11 | 9,a3,b5,William Bridge,"3131 Webster St, San Francisco",Alfons Kemper,"170 Post St, Apt 4, San Francisco" 12 | 10,a1,b4,Kevin Smith,"607 From St, San Francisco",Joseph Kuan,"108 South Park, San Francisco" 13 | 11,a3,b4,William Bridge,"3131 Webster St, San Francisco",Joseph Kuan,"108 South Park, San Francisco" 14 | 12,a5,b2,Alphonse Kemper,"1702 Post Street, San Francisco",Bill Bridge,"3131 Webster St, San Francisco" 15 | 13,a4,b6,Binto George,"423 Powell St, San Francisco",Michael Brodie,"133 Clement Street, San Francisco" 16 | 14,a5,b1,Alphonse Kemper,"1702 Post Street, San Francisco",Mark Levene,"108 Clement St, San Francisco" -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/debugblocker/test_debugblocker_cand.csv: -------------------------------------------------------------------------------- 1 | _id,ltable_ID,rtable_book_id 2 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/debugblocker/test_debugblocker_ltable.csv: -------------------------------------------------------------------------------- 1 | ID,title,author,publisher,price,desc,genre,year,lang 2 | 0,"intro to database","John Doe","ABC publisher",10.00,"introduction to database",,2010, 3 | 1,"data analysis","Jane Doe","BCD publisher",20.00,"introduction to data analysis",,2015,"ENG" 4 | 2,"Thinking in Java","Johnnie Doe",,10.00,"learn how to program in Java",,2000,"ENG" 5 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/debugblocker/test_debugblocker_rtable.csv: -------------------------------------------------------------------------------- 1 | book_id,book_title,author,publisher,price,pub_year,language,description,book_genre 2 | "B001","introduction to data analysis","John Doe","ABC publisher",10.00,2015,English,"introduction to data analysis", 3 | "B002","Thinking in C","Jane Doe","BCD publisher",15.00,1990,,"learn programming in C++", 4 | "B003","A brief history of time","Stephen Hawking",,20.00,1988,English,"from Big Bang to black holes", 5 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/debugblocker/test_get_tokenized_table_1.txt:
-------------------------------------------------------------------------------- 1 | a1 kevin smith 1989 30 607 from st, san francisco 94107 2 | a2 michael franklin 1988 28 1652 stockton st, san francisco 94122 3 | a3 william bridge 1986 32 3131 webster st, san francisco 94107 4 | a4 binto george 1987 32 423 powell st, san francisco 94122 5 | a5 alphonse kemper 1984 35 1702 post street, san francisco 94122 6 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/debugblocker/test_get_tokenized_table_2.txt: -------------------------------------------------------------------------------- 1 | b1 mark levene 30 2 | b2 bill bridge 32 3 | b3 mike franklin 28 4 | b4 joseph kuan 26 5 | b5 alfons kemper 35 6 | b6 michael brodie 32 7 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/debugblocker/test_topk_sim_join_1_A.txt: -------------------------------------------------------------------------------- 1 | a1 kevin smith 1989 30 607 from st, san francisco 94107 2 | a2 michael franklin 1988 28 1652 stockton st, san francisco 94122 3 | a3 william bridge 1986 32 3131 webster st, san francisco 94107 4 | a4 binto george 1987 32 423 powell st, san francisco 94122 5 | a5 alphonse kemper 1984 35 1702 post street, san francisco 94122 6 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/debugblocker/test_topk_sim_join_1_B.txt: -------------------------------------------------------------------------------- 1 | b1 mark levene 1987 30 108 clement st, san francisco 94107 2 | b2 bill bridge 1986 32 3131 webster st, san francisco 94107 3 | b3 mike franklin 1988 28 1652 stockton st, san francisco 94122 4 | b4 joseph kuan 1982 26 108 south park, san francisco 94122 5 | b5 alfons kemper 1984 35 170 post st, apt 4, san francisco 94122 6 | b6 michael brodie 1987 32 133 clement street, san francisco 94107 7 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/debugblocker/test_topk_sim_join_1_C.txt: -------------------------------------------------------------------------------- 1 | 0 0 2 | 0 1 3 | 0 5 4 | 1 2 5 | 1 3 6 | 1 4 7 | 2 0 8 | 2 1 9 | 2 5 10 | 3 2 11 | 3 3 12 | 3 4 13 | 4 2 14 | 4 3 15 | 4 4 16 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/A.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | a1,Kevin Smith,1989,30.0,"607 From St, San Francisco",94107 3 | a2,Michael Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122 4 | a3,William Bridge,1986,32.0,"3131 Webster St, San Francisco",94107 5 | a4,Binto George,1987,32.5,"423 Powell St, San Francisco",94122 6 | a5,Alphonse Kemper,1984,35.0,"1702 Post Street, San Francisco",94122 7 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/A.mdx: -------------------------------------------------------------------------------- 1 | #key=ID 2 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/A_dupid.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | a1,Kevin Smith,1989,30,"607 From St, San 
Francisco",94107 3 | a2,Michael Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122 4 | a3,William Bridge,1986,32,"3131 Webster St, San Francisco",94107 5 | a4,Binto George,1987,32.5,"423 Powell St, San Francisco",94122 6 | a5,Alphonse Kemper,1984,35,"1702 Post Street, San Francisco",94122 7 | a5,Alphonse Kemper,1984,35,"1702 Post Street, San Francisco",94122 -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/A_key_zipcode.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | a1,Kevin Smith,1989,30.0,"607 From St, San Francisco",94107 3 | a2,Michael Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122 4 | a3,William Bridge,1986,32.0,"3131 Webster St, San Francisco",94107 5 | a4,Binto George,1987,32.5,"423 Powell St, San Francisco",94122 6 | a5,Alphonse Kemper,1984,35.0,"1702 Post Street, San Francisco",94122 7 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/A_key_zipcode.metadata: -------------------------------------------------------------------------------- 1 | #key=zipcode 2 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/A_md_wrongformat.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | a1,Kevin Smith,1989,30.0,"607 From St, San Francisco",94107 3 | a2,Michael Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122 4 | a3,William Bridge,1986,32.0,"3131 Webster St, San Francisco",94107 5 | a4,Binto George,1987,32.5,"423 Powell St, San Francisco",94122 6 | a5,Alphonse Kemper,1984,35.0,"1702 Post Street, San Francisco",94122 7 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/A_md_wrongformat.metadata: -------------------------------------------------------------------------------- 1 | #key=zipcode#=10 2 | %%10 3 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/A_mvals.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | a1,Kevin Smith,1989,30,"607 From St, San Francisco",94107 3 | a2,Michael Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122 4 | a3,William Bridge,1986,32,"3131 Webster St, San Francisco",94107 5 | a4,Binto George,1987,32.5,"423 Powell St, San Francisco",94122 6 | ,Alphonse Kemper,1984,35,"1702 Post Street, San Francisco",94122 -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/C_partialmeta.csv: -------------------------------------------------------------------------------- 1 | _id,ltable_ID,rtable_ID,ltable_name,ltable_zipcode,ltable_birth_year,rtable_name,rtable_zipcode,rtable_birth_year 2 | 0,a1,b1,Kevin Smith,94107,1989,Mark Levene,94107,1987 3 | 1,a1,b2,Kevin Smith,94107,1989,Bill Bridge,94107,1986 4 | 2,a1,b6,Kevin Smith,94107,1989,Michael Brodie,94107,1987 5 | 3,a3,b1,William Bridge,94107,1986,Mark Levene,94107,1987 6 | 4,a3,b2,William Bridge,94107,1986,Bill Bridge,94107,1986 7 | 5,a3,b6,William Bridge,94107,1986,Michael Brodie,94107,1987 8 | 6,a2,b3,Michael Franklin,94122,1988,Mike Franklin,94122,1988 9 | 7,a2,b4,Michael 
Franklin,94122,1988,Joseph Kuan,94122,1982 10 | 8,a2,b5,Michael Franklin,94122,1988,Alfons Kemper,94122,1984 11 | 9,a4,b3,Binto George,94122,1987,Mike Franklin,94122,1988 12 | 10,a4,b4,Binto George,94122,1987,Joseph Kuan,94122,1982 13 | 11,a4,b5,Binto George,94122,1987,Alfons Kemper,94122,1984 14 | 12,a5,b3,Alphonse Kemper,94122,1984,Mike Franklin,94122,1988 15 | 13,a5,b4,Alphonse Kemper,94122,1984,Joseph Kuan,94122,1982 16 | 14,a5,b5,Alphonse Kemper,94122,1984,Alfons Kemper,94122,1984 -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/C_partialmeta.metadata: -------------------------------------------------------------------------------- 1 | #ltable=POINTER 2 | #rtable=POINTER 3 | #key=_id 4 | #fk_rtable=rtable_ID 5 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/InvalidMetadata1.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | a1,Kevin Smith,1989,30.0,"607 From St, San Francisco",94107 3 | a2,Michael Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122 4 | a3,William Bridge,1986,32.0,"3131 Webster St, San Francisco",94107 5 | a4,Binto George,1987,32.5,"423 Powell St, San Francisco",94122 6 | a5,Alphonse Kemper,1984,35.0,"1702 Post Street, San Francisco",94122 7 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/InvalidMetadata1.metadata: -------------------------------------------------------------------------------- 1 | #key1=ID 2 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/InvalidMetadata2.csv: -------------------------------------------------------------------------------- 1 | ID,name,birth_year,hourly_wage,address,zipcode 2 | a1,Kevin Smith,1989,30.0,"607 From St, San Francisco",94107 3 | a2,Michael Franklin,1988,27.5,"1652 Stockton St, San Francisco",94122 4 | a3,William Bridge,1986,32.0,"3131 Webster St, San Francisco",94107 5 | a4,Binto George,1987,32.5,"423 Powell St, San Francisco",94122 6 | a5,Alphonse Kemper,1984,35.0,"1702 Post Street, San Francisco",94122 7 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/InvalidMetadata2.metadata: -------------------------------------------------------------------------------- 1 | #key=ID1 2 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/expected_A.metadata: -------------------------------------------------------------------------------- 1 | #key=ID -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/io/expected_C.metadata: -------------------------------------------------------------------------------- 1 | #ltable=POINTER 2 | #rtable=POINTER 3 | #fk_rtable=rtable_ID 4 | #fk_ltable=ltable_ID 5 | #key=_id -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/matcherselector/feat_vecs.metadata: -------------------------------------------------------------------------------- 1 | #key=_id 2 | #fk_ltable=ltable.id 3 | #ltable=POINTER 4 | #rtable=POINTER 5 | #fk_rtable=rtable.id 6 | -------------------------------------------------------------------------------- 
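Taken together, the .metadata sidecars above follow a single convention: #key names the key attribute, #fk_ltable/#fk_rtable name the foreign keys into the base tables, and #ltable/#rtable are POINTER placeholders that get resolved at load time from DataFrames supplied by the caller. A minimal sketch of that round trip, mirroring test_sampler_single_table.py further below:

import os
import py_entitymatching.catalog.catalog_manager as cm
from py_entitymatching.io.parsers import read_csv_metadata
from py_entitymatching.sampler.single_table import sample_table
from py_entitymatching.utils.generic_helper import get_install_path

datasets_path = os.sep.join([get_install_path(), 'tests', 'test_datasets'])
A = read_csv_metadata(os.sep.join([datasets_path, 'A.csv']))
B = read_csv_metadata(os.sep.join([datasets_path, 'B.csv']), key='ID')
# The POINTER entries in C.metadata are replaced by the ltable/rtable
# arguments; fk_ltable/fk_rtable come straight from the sidecar.
C = read_csv_metadata(os.sep.join([datasets_path, 'C.csv']), ltable=A, rtable=B)
S = sample_table(C, 10, False)  # sample 10 pairs; C's metadata is copied to S
assert cm.get_fk_ltable(S) == cm.get_fk_ltable(C)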
/py_entitymatching/tests/test_datasets/sandbox/A.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/tests/test_datasets/sandbox/A.pkl -------------------------------------------------------------------------------- /py_entitymatching/tests/test_datasets/sandbox/A.pklmetadata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/tests/test_datasets/sandbox/A.pklmetadata -------------------------------------------------------------------------------- /py_entitymatching/tests/test_feature_attributeutils.py: -------------------------------------------------------------------------------- 1 | import os 2 | # from nose.tools import * 3 | import unittest 4 | import pandas as pd 5 | import six 6 | from .utils import raises 7 | 8 | from py_entitymatching.utils.generic_helper import get_install_path 9 | from py_entitymatching.io.parsers import read_csv_metadata 10 | from py_entitymatching.feature.simfunctions import get_sim_funs_for_matching 11 | from py_entitymatching.feature.tokenizers import get_tokenizers_for_matching 12 | from py_entitymatching.feature.autofeaturegen import get_features_for_matching 13 | from py_entitymatching.feature.attributeutils import get_attr_corres, get_attr_types, _get_type, _len_handle_nan 14 | 15 | import py_entitymatching.catalog.catalog_manager as cm 16 | 17 | datasets_path = os.sep.join([get_install_path(), 'tests', 'test_datasets']) 18 | path_a = os.sep.join([datasets_path, 'A.csv']) 19 | path_b = os.sep.join([datasets_path, 'B.csv']) 20 | 21 | 22 | class AttributeUtilsTestCases(unittest.TestCase): 23 | def test_get_attr_types_valid(self): 24 | A = read_csv_metadata(path_a) 25 | x = get_attr_types(A) 26 | 27 | @raises(AssertionError) 28 | def test_get_attr_types_invalid_df(self): 29 | x = get_attr_types(None) 30 | 31 | def test_get_attr_corres_valid_1(self): 32 | A = read_csv_metadata(path_a) 33 | B = read_csv_metadata(path_b, key='ID') 34 | ac = get_attr_corres(A, B) 35 | for c in ac['corres']: 36 | self.assertEqual(c[0], c[1]) 37 | 38 | self.assertEqual(all(ac['ltable'] == A), True) 39 | self.assertEqual(all(ac['rtable'] == B), True) 40 | 41 | def test_get_attr_corres_valid_2(self): 42 | A = read_csv_metadata(path_a) 43 | A['label'] = 0 44 | B = read_csv_metadata(path_b, key='ID') 45 | ac = get_attr_corres(A, B) 46 | for c in ac['corres']: 47 | self.assertEqual(c[0], c[1]) 48 | 49 | self.assertEqual(all(ac['ltable'] == A), True) 50 | self.assertEqual(all(ac['rtable'] == B), True) 51 | 52 | 53 | @raises(AssertionError) 54 | def test_get_attr_corres_invalid_df1(self): 55 | ac = get_attr_corres(None, pd.DataFrame()) 56 | 57 | @raises(AssertionError) 58 | def test_get_attr_corres_invalid_df2(self): 59 | ac = get_attr_corres(pd.DataFrame(), None) 60 | 61 | def test_get_type_valid(self): 62 | A = read_csv_metadata(path_a) 63 | t = _get_type(A['ID']) 64 | self.assertEqual(t, 'str_eq_1w') 65 | 66 | @raises(AssertionError) 67 | def test_get_type_invalid_series(self): 68 | _get_type(None) 69 | 70 | 71 | def test_get_type_empty_series(self): 72 | t = _get_type(pd.Series()) 73 | self.assertEqual(t, 'un_determined') 74 | 75 | @raises(AssertionError) 76 | def test_get_type_multiple_types(self): 77 | A = read_csv_metadata(path_a) 78 | A.loc[0, 'ID'] = 1000 79 | t = 
_get_type(A['ID']) 80 | 81 | def test_get_type_valid_2(self): 82 | A = read_csv_metadata(path_a) 83 | A['temp'] = True 84 | t = _get_type(A['temp']) 85 | self.assertEqual(t, 'boolean') 86 | 87 | def test_get_type_valid_3(self): 88 | A = read_csv_metadata(path_a) 89 | A['temp'] = "This is a very very very very very very very very very very very very very long string" 90 | t = _get_type(A['temp']) 91 | self.assertEqual(t, "str_gt_10w") 92 | 93 | def test_len_handle_nan_invalid(self): 94 | result = _len_handle_nan(None) 95 | self.assertEqual(pd.isnull(result), True) 96 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_feature_tokenizers.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | # from nose.tools import * 3 | import unittest 4 | import pandas as pd 5 | import numpy as np 6 | import six 7 | from .utils import raises 8 | 9 | import py_entitymatching.feature.tokenizers as tok 10 | 11 | class TokenizerTestCases(unittest.TestCase): 12 | def test_get_global_tokenizers(self): 13 | x = tok._global_tokenizers 14 | 15 | def test_get_tokenizers_for_blocking(self): 16 | x = tok.get_tokenizers_for_blocking() 17 | self.assertEqual(isinstance(x, dict), True) 18 | input = 'data science' 19 | for name, value in six.iteritems(x): 20 | self.assertEqual(isinstance(value(input), list), True) 21 | 22 | @raises(AssertionError) 23 | def test_get_tokenizers_for_blocking_invalid(self): 24 | tok.get_tokenizers_for_blocking(None, None) 25 | 26 | def test_get_tokenizers_for_matching(self): 27 | x = tok.get_tokenizers_for_matching() 28 | self.assertEqual(isinstance(x, dict), True) 29 | input = 'data science' 30 | for name, value in six.iteritems(x): 31 | self.assertEqual(isinstance(value(input), list), True) 32 | 33 | @raises(AssertionError) 34 | def test_get_tokenizers_for_matching_invalid(self): 35 | x = tok.get_tokenizers_for_matching(None, None) 36 | 37 | 38 | @raises(AssertionError) 39 | def test_get_single_arg_tokenizers_invalid_1(self): 40 | tok._get_single_arg_tokenizers(None, None) 41 | 42 | 43 | def test_get_single_arg_tokenizers_valid_2(self): 44 | tok._get_single_arg_tokenizers(q=3, dlm_char=' ') 45 | 46 | def test_get_single_arg_tokenizers_valid_3(self): 47 | tok._get_single_arg_tokenizers(q=[], dlm_char=[]) 48 | 49 | def test_get_single_arg_tokenizers_valid_4(self): 50 | tok._get_single_arg_tokenizers(q=None, dlm_char=[' ']) 51 | 52 | def test_get_single_arg_tokenizers_valid_5(self): 53 | tok._get_single_arg_tokenizers(q=3, dlm_char=None) 54 | 55 | def test_qgram_invalid(self): 56 | x = tok._make_tok_qgram(3) 57 | self.assertEqual(pd.isnull(x(np.NaN)), True) 58 | 59 | def test_qgram_delim(self): 60 | x = tok._make_tok_delim(' ') 61 | self.assertEqual(pd.isnull(x(np.NaN)), True) 62 | 63 | def test_tokqgram_valid(self): 64 | x = tok.tok_qgram('data science', 3) 65 | self.assertEqual(isinstance(x, list), True) 66 | 67 | def test_tokdelim_valid(self): 68 | x = tok.tok_delim('data science', ' ') 69 | self.assertEqual(isinstance(x, list), True) 70 | self.assertEqual(len(x), 2) 71 | 72 | def test_tokqgram_invalid(self): 73 | x = tok.tok_qgram(np.NaN, 3) 74 | self.assertEqual(pd.isnull(x), True) 75 | 76 | def test_tokdelim_invalid(self): 77 | x = tok.tok_delim(np.NaN, ' ') 78 | self.assertEqual(pd.isnull(x), True) 79 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_sampler_single_table.py: 
-------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import os 3 | # from nose.tools import * 4 | import unittest 5 | import pandas as pd 6 | import six 7 | from .utils import raises 8 | 9 | from py_entitymatching.utils.generic_helper import get_install_path 10 | import py_entitymatching.catalog.catalog_manager as cm 11 | from py_entitymatching.io.parsers import read_csv_metadata 12 | from py_entitymatching.sampler.single_table import sample_table 13 | 14 | datasets_path = os.sep.join([get_install_path(), 'tests', 'test_datasets']) 15 | path_a = os.sep.join([datasets_path, 'A.csv']) 16 | path_b = os.sep.join([datasets_path, 'B.csv']) 17 | path_c = os.sep.join([datasets_path, 'C.csv']) 18 | 19 | class SamplerSingleTableTestCases(unittest.TestCase): 20 | def test_sample_table_valid_1(self): 21 | A = read_csv_metadata(path_a) 22 | B = read_csv_metadata(path_b, key='ID') 23 | C = read_csv_metadata(path_c, ltable=A, rtable=B) 24 | D = sample_table(C, 10, False) 25 | self.assertEqual(cm.get_all_properties(C), cm.get_all_properties(D)) 26 | self.assertEqual(len(D), 10) 27 | 28 | def test_sample_table_valid_2(self): 29 | A = read_csv_metadata(path_a) 30 | B = read_csv_metadata(path_b, key='ID') 31 | C = read_csv_metadata(path_c, ltable=A, rtable=B) 32 | D = sample_table(C, 10, True) 33 | self.assertEqual(id(cm.get_ltable(D)), id(cm.get_ltable(C))) 34 | self.assertEqual(id(cm.get_rtable(D)), id(cm.get_rtable(C))) 35 | self.assertEqual(cm.get_fk_ltable(D), cm.get_fk_ltable(C)) 36 | self.assertEqual(cm.get_fk_rtable(D), cm.get_fk_rtable(C)) 37 | self.assertEqual(len(D), 10) 38 | 39 | @raises(AssertionError) 40 | def test_sample_table_invalid_df(self): 41 | A = read_csv_metadata(path_a) 42 | B = read_csv_metadata(path_b, key='ID') 43 | C = read_csv_metadata(path_c, ltable=A, rtable=B) 44 | D = sample_table(None, 10, True) 45 | # self.assertEqual(cm.get_all_properties(C), cm.get_all_properties(D)) 46 | # self.assertEqual(len(D), 10) 47 | 48 | @raises(AssertionError) 49 | def test_sample_table_invalid_size(self): 50 | A = read_csv_metadata(path_a) 51 | B = read_csv_metadata(path_b, key='ID') 52 | C = read_csv_metadata(path_c, ltable=A, rtable=B) 53 | D = sample_table(C, len(C)+1, True) 54 | 55 | @raises(AssertionError) 56 | def test_sample_table_invalid_df_sz0(self): 57 | # A = read_csv_metadata(path_a) 58 | # B = read_csv_metadata(path_b, key='ID') 59 | # C = read_csv_metadata(path_c, ltable=A, rtable=B) 60 | D = sample_table(pd.DataFrame(), 1, True) 61 | -------------------------------------------------------------------------------- /py_entitymatching/tests/test_validation_helper.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | import unittest 4 | 5 | # from nose.tools import * 6 | import pandas as pd 7 | from .utils import raises 8 | 9 | from py_entitymatching.utils import validation_helper as vh 10 | 11 | 12 | class ValidationHelperTestCases(unittest.TestCase): 13 | def test_validate_object_type_with_valid_type(self): 14 | vh.validate_object_type('ABC', str) 15 | vh.validate_object_type(pd.DataFrame(), pd.DataFrame) 16 | vh.validate_object_type(list(), list) 17 | vh.validate_object_type(True, bool) 18 | vh.validate_object_type(123, int) 19 | vh.validate_object_type(dict(), dict) 20 | 21 | # Currently, can validate unexpected types 22 | class A(object): pass 23 | a = A() 24 | vh.validate_object_type(a, A) 25 | 26 | def test_validate_object_type_with_invalid_type(self): 27 | 
self.assertRaises(AssertionError, lambda: vh.validate_object_type('ABC', int)) 28 | self.assertRaises(AssertionError, lambda: vh.validate_object_type(123, str)) 29 | self.assertRaises(AssertionError, lambda: vh.validate_object_type(list(), dict)) 30 | self.assertRaises(AssertionError, lambda: vh.validate_object_type(dict(), list)) 31 | 32 | def test_validate_object_type_with_unexpected_type(self): 33 | class B(object): pass 34 | self.assertRaises(KeyError, lambda: vh.validate_object_type(123, B)) 35 | 36 | def test_validate_subclass_with_valid_class(self): 37 | class C(object): pass 38 | class D(C): pass 39 | class E(D): pass 40 | vh.validate_subclass(E, E) 41 | vh.validate_subclass(E, D) 42 | vh.validate_subclass(E, C) 43 | 44 | def test_validate_subclass_with_invalid_class(self): 45 | class F(object): pass 46 | class G(object): pass 47 | class H(G): pass 48 | self.assertRaises(AssertionError, lambda: vh.validate_subclass(G, F)) 49 | self.assertRaises(AssertionError, lambda: vh.validate_subclass(H, F)) 50 | -------------------------------------------------------------------------------- /py_entitymatching/tests/utils.py: -------------------------------------------------------------------------------- 1 | # Simplified knockoff of nose.tools.raises 2 | # Thanks to zware for writing this for py_stringmatching 3 | def raises(exc_type): 4 | def deco(f): 5 | def raises_wrapper(self): 6 | with self.assertRaises(exc_type): 7 | return f(self) 8 | return raises_wrapper 9 | return deco 10 | -------------------------------------------------------------------------------- /py_entitymatching/triggers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/triggers/__init__.py -------------------------------------------------------------------------------- /py_entitymatching/tuner/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_entitymatching/0eebc5823a5faac89934c52cfe2b7cb5f6e15e8a/py_entitymatching/tuner/__init__.py -------------------------------------------------------------------------------- /py_entitymatching/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # install path 2 | import os 3 | 4 | install_path = os.path.dirname(os.path.realpath(__file__)) 5 | -------------------------------------------------------------------------------- /py_entitymatching/utils/stop_words.txt: -------------------------------------------------------------------------------- 1 | a 2 | about 3 | above 4 | across 5 | after 6 | afterwards 7 | again 8 | against 9 | all 10 | almost 11 | alone 12 | along 13 | already 14 | also 15 | although 16 | always 17 | am 18 | among 19 | amongst 20 | amoungst 21 | amount 22 | an 23 | and 24 | another 25 | any 26 | anyhow 27 | anyone 28 | anything 29 | anyway 30 | anywhere 31 | are 32 | around 33 | as 34 | at 35 | back 36 | be 37 | became 38 | because 39 | become 40 | becomes 41 | becoming 42 | been 43 | before 44 | beforehand 45 | behind 46 | being 47 | below 48 | beside 49 | besides 50 | between 51 | beyond 52 | bill 53 | both 54 | bottom 55 | but 56 | by 57 | call 58 | can 59 | cannot 60 | cant 61 | co 62 | con 63 | could 64 | couldnt 65 | cry 66 | de 67 | describe 68 | detail 69 | do 70 | done 71 | down 72 | due 73 | during 74 | each 75 | eg 76 | eight 77 | either 78 | eleven 
79 | else 80 | elsewhere 81 | empty 82 | enough 83 | etc 84 | even 85 | ever 86 | every 87 | everyone 88 | everything 89 | everywhere 90 | except 91 | few 92 | fifteen 93 | fify 94 | fill 95 | find 96 | fire 97 | first 98 | five 99 | for 100 | former 101 | formerly 102 | forty 103 | found 104 | four 105 | from 106 | front 107 | full 108 | further 109 | get 110 | give 111 | go 112 | had 113 | has 114 | hasnt 115 | have 116 | he 117 | hence 118 | her 119 | here 120 | hereafter 121 | hereby 122 | herein 123 | hereupon 124 | hers 125 | herself 126 | him 127 | himself 128 | his 129 | how 130 | however 131 | hundred 132 | ie 133 | if 134 | in 135 | inc 136 | indeed 137 | interest 138 | into 139 | is 140 | it 141 | its 142 | itself 143 | keep 144 | last 145 | latter 146 | latterly 147 | least 148 | less 149 | ltd 150 | made 151 | many 152 | may 153 | me 154 | meanwhile 155 | might 156 | mill 157 | mine 158 | more 159 | moreover 160 | most 161 | mostly 162 | move 163 | much 164 | must 165 | my 166 | myself 167 | name 168 | namely 169 | neither 170 | never 171 | nevertheless 172 | next 173 | nine 174 | no 175 | nobody 176 | none 177 | noone 178 | nor 179 | not 180 | nothing 181 | now 182 | nowhere 183 | of 184 | off 185 | often 186 | on 187 | once 188 | one 189 | only 190 | onto 191 | or 192 | other 193 | others 194 | otherwise 195 | our 196 | ours 197 | ourselves 198 | out 199 | over 200 | own 201 | part 202 | per 203 | perhaps 204 | please 205 | put 206 | rather 207 | re 208 | same 209 | see 210 | seem 211 | seemed 212 | seeming 213 | seems 214 | serious 215 | several 216 | she 217 | should 218 | show 219 | side 220 | since 221 | sincere 222 | six 223 | sixty 224 | so 225 | some 226 | somehow 227 | someone 228 | something 229 | sometime 230 | sometimes 231 | somewhere 232 | still 233 | such 234 | system 235 | take 236 | ten 237 | than 238 | that 239 | the 240 | their 241 | them 242 | themselves 243 | then 244 | thence 245 | there 246 | thereafter 247 | thereby 248 | therefore 249 | therein 250 | thereupon 251 | these 252 | they 253 | thickv 254 | thin 255 | third 256 | this 257 | those 258 | though 259 | three 260 | through 261 | throughout 262 | thru 263 | thus 264 | to 265 | together 266 | too 267 | top 268 | toward 269 | towards 270 | twelve 271 | twenty 272 | two 273 | un 274 | under 275 | until 276 | up 277 | upon 278 | us 279 | very 280 | via 281 | was 282 | we 283 | well 284 | were 285 | what 286 | whatever 287 | when 288 | whence 289 | whenever 290 | where 291 | whereafter 292 | whereas 293 | whereby 294 | wherein 295 | whereupon 296 | wherever 297 | whether 298 | which 299 | while 300 | whither 301 | who 302 | whoever 303 | whole 304 | whom 305 | whose 306 | why 307 | will 308 | with 309 | within 310 | without 311 | would 312 | yet 313 | you 314 | your 315 | yours 316 | yourself 317 | yourselves 318 | -------------------------------------------------------------------------------- /py_entitymatching/utils/validation_helper.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import six 3 | 4 | 5 | def type_name(expected_type): 6 | messages = { 7 | six.string_types: 'string', 8 | pd.DataFrame: 'pandas dataframe', 9 | list: 'list', 10 | bool: 'bool', 11 | int: 'int', 12 | dict: 'dictionary', 13 | str: 'str', 14 | } 15 | return messages[expected_type] 16 | 17 | 18 | def validate_object_type(input_object, expected_type, error_prefix='Input object'): 19 | if not isinstance(input_object, expected_type): 20 | error_message = '{0}: {1} 
\nis not of type {2}'.format(error_prefix, str(input_object), type_name(expected_type)) 21 | raise AssertionError(error_message) 22 | 23 | 24 | def validate_subclass(input_class, expected_class, error_prefix='Input class'): 25 | if not issubclass(input_class, expected_class): 26 | error_message = f'{error_prefix}: {str(input_class)}\nis not a subclass of {str(expected_class)}' 27 | raise AssertionError(error_message) 28 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | ipython>=5.6 3 | matplotlib>=2.2.4 4 | PyPrind==2.9.8 5 | py-stringmatching>=0.2.1 6 | py-stringsimjoin>=0.3.0 7 | numpy 8 | scikit-learn>=0.22 9 | scipy>=1.3.2 10 | cloudpickle 11 | -------------------------------------------------------------------------------- /requirements.yml: -------------------------------------------------------------------------------- 1 | name: py_entitymatching_dev 2 | channels: 3 | - conda-forge 4 | - uwmagellan 5 | - defaults 6 | dependencies: 7 | - ipython == 5.6 8 | - matplotlib >= 2.2.4 9 | - setuptools 10 | - py_stringsimjoin >= 0.3.0 11 | - cloudpickle 12 | - pyparsing 13 | - scikit-learn >= 0.18 14 | - pyqt 15 | - py_stringmatching 16 | - requests 17 | # - xgboost 18 | --------------------------------------------------------------------------------
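
The tests in test_feature_tokenizers.py above pin down a contract worth spelling out: the tokenizers return a list of tokens for string input and pass NaN through unchanged for missing values. A minimal usage sketch of that behavior, assuming py_entitymatching is installed; the results noted in the comments are inferred from the assertions in the tests, not verified output.

import numpy as np
import pandas as pd
import py_entitymatching.feature.tokenizers as tok

# Delimiter tokenization: splits on the given character.
print(tok.tok_delim('data science', ' '))     # ['data', 'science']

# Q-gram tokenization: overlapping character 3-grams.
print(tok.tok_qgram('data science', 3))       # a list of 3-grams

# Missing values are not tokenized; NaN is passed through.
print(pd.isnull(tok.tok_qgram(np.NaN, 3)))    # True
print(pd.isnull(tok.tok_delim(np.NaN, ' ')))  # True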
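
Likewise, test_sampler_single_table.py documents that sample_table returns a sample whose catalog metadata (key, ltable, rtable, and both foreign keys) matches the input candidate set. A sketch under two assumptions: the CSV paths are placeholders for tables with accompanying metadata files, and the third positional argument toggles sampling with replacement (inferred from the tests, not confirmed by a docstring).

import py_entitymatching.catalog.catalog_manager as cm
from py_entitymatching.io.parsers import read_csv_metadata
from py_entitymatching.sampler.single_table import sample_table

A = read_csv_metadata('A.csv')                      # key comes from the metadata file
B = read_csv_metadata('B.csv', key='ID')
C = read_csv_metadata('C.csv', ltable=A, rtable=B)  # candidate set linking A and B

D = sample_table(C, 10, True)                       # 10 rows, with replacement
assert len(D) == 10
assert cm.get_ltable(D) is cm.get_ltable(C)         # catalog metadata is carried over
assert cm.get_fk_ltable(D) == cm.get_fk_ltable(C)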
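
The raises decorator in tests/utils.py is the suite's stand-in for nose.tools.raises: it wraps a unittest method in assertRaises, so the test passes exactly when the body raises the named exception. A sketch of how a new test module would use it; the absolute import path is an assumption based on the package layout shown above.

import unittest
from py_entitymatching.tests.utils import raises

class RaisesDemoTestCases(unittest.TestCase):
    @raises(ZeroDivisionError)
    def test_divide_by_zero(self):
        # Passes because the body raises the declared exception;
        # any other outcome fails the test.
        1 / 0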
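
Finally, validation_helper.py above has two distinct failure modes that test_validation_helper.py exercises: a type mismatch for a known type raises AssertionError with a readable message, while a type missing from the type_name lookup table raises KeyError before the message can be built. A short sketch, assuming only the code shown above; MyType is a hypothetical class introduced for illustration.

import pandas as pd
from py_entitymatching.utils import validation_helper as vh

vh.validate_object_type(pd.DataFrame(), pd.DataFrame)  # passes silently

try:
    vh.validate_object_type(123, str, error_prefix='Candidate key')
except AssertionError as e:
    print(e)  # 'Candidate key: 123 \nis not of type str'

class MyType(object):
    pass

try:
    vh.validate_object_type(123, MyType)  # MyType has no entry in type_name
except KeyError:
    print('no display name registered for MyType')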