├── .coveragerc ├── .github └── workflows │ ├── deploy.yml │ └── tests.yml ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.rst ├── binder ├── README.md ├── postBuild └── requirements.txt ├── docs ├── Makefile ├── _static │ ├── logo-icon.png │ ├── logo.png │ └── style.css ├── conf.py ├── images │ ├── overview-basic-workflow.png │ ├── overview-blocking-tables.png │ ├── overview-blocking-workflow.png │ ├── overview-dataflow.png │ ├── overview-evaluation-workflow.png │ ├── overview-inputs.png │ └── overview-tables.png ├── index.rst ├── installation.rst ├── mod_blocking.rst ├── mod_cli.rst ├── mod_dataset.rst ├── mod_evaluation.rst ├── mod_io.rst ├── mod_record.rst ├── mod_remote.rst ├── mod_similarity.rst ├── mod_tokenizer.rst ├── mod_utils.rst ├── modules.rst ├── overview.rst ├── real_world_example.ipynb ├── resources │ ├── dblp.csv │ ├── dblp_scholar_gt.csv │ └── scholar.jl ├── scaling_and_optimization.rst └── step_by_step.ipynb ├── examples ├── basic │ ├── auto_record.py │ ├── basic.py │ ├── cached_record.py │ ├── dedup.py │ ├── ds1.csv │ └── ds2.jl ├── blocking │ ├── block_io_operations.py │ ├── canopy.py │ ├── ds1.csv │ ├── ds2.jl │ ├── generate_blocks.py │ └── inverted_index.py └── evaluation │ ├── .gitignore │ ├── __init__.py │ ├── construct_datasets.py │ ├── data_1.csv │ ├── data_2.csv │ ├── generate_negative_gt.py │ ├── ground_truth.csv │ ├── gt_positive_only.csv │ └── run_evaluation.py ├── requirements.txt ├── requirements_dev.txt ├── requirements_docs.txt ├── rltk ├── __init__.py ├── __main__.py ├── blocking │ ├── __init__.py │ ├── _minhash_lsh.py │ ├── block.py │ ├── block_black_list.py │ ├── block_generator.py │ ├── blocking_helper.py │ ├── canopy_block_generator.py │ ├── hash_block_generator.py │ ├── sorted_neighbourhood_block_generator.py │ └── token_block_generator.py ├── cli.py ├── dataset.py ├── evaluation │ ├── __init__.py │ ├── evaluation.py │ ├── ground_truth.py │ └── trial.py ├── io │ ├── __init__.py │ ├── adapter │ │ ├── __init__.py │ │ ├── dbm_key_value_adapter.py │ │ ├── hbase_key_value_adapter.py │ │ ├── key_set_adapter.py │ │ ├── key_value_adapter.py │ │ ├── leveldb_key_set_adapter.py │ │ ├── memory_key_set_adapter.py │ │ ├── memory_key_value_adapter.py │ │ ├── redis_key_set_adapter.py │ │ └── redis_key_value_adapter.py │ ├── io_utils.py │ ├── reader │ │ ├── __init__.py │ │ ├── array_reader.py │ │ ├── csv_reader.py │ │ ├── dataframe_reader.py │ │ ├── ground_truth_reader.py │ │ ├── jsonlines_reader.py │ │ └── reader.py │ ├── serializer │ │ ├── __init__.py │ │ ├── pickle_serializer.py │ │ └── serializer.py │ └── writer │ │ ├── __init__.py │ │ ├── ground_truth_writer.py │ │ └── writer.py ├── record.py ├── remote │ ├── __init__.py │ ├── remote.py │ └── task.py ├── similarity │ ├── __init__.py │ ├── cosine.py │ ├── dice.py │ ├── distance.py │ ├── equal.py │ ├── hamming.py │ ├── hybrid.py │ ├── jaccard.py │ ├── jaro.py │ ├── lcs.py │ ├── levenshtein.py │ ├── metaphone.py │ ├── needleman.py │ ├── ngram.py │ ├── nysiis.py │ ├── qgram.py │ ├── soundex.py │ └── tf_idf.py ├── tests │ ├── __init__.py │ ├── test_blocking.py │ ├── test_io_adapter.py │ ├── test_io_reader.py │ ├── test_similarity.py │ └── test_trial.py ├── tokenizer │ ├── __init__.py │ └── crf_tokenizer │ │ ├── LICENSE │ │ ├── README.md │ │ ├── __init__.py │ │ └── crf_tokenizer.py └── utils.py └── setup.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = rltk/tests/* 3 | 4 | 
-------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: Deploy 2 | on: 3 | push: 4 | tags: 5 | - '*' 6 | jobs: 7 | deploy-to-pypi: 8 | name: Deploy to pypi 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: checkout code 12 | uses: actions/checkout@v2 13 | - name: Set up Python 14 | uses: actions/setup-python@v2 15 | with: 16 | python-version: '3.7' 17 | - name: Install dependencies 18 | run: | 19 | python -m pip install --upgrade pip 20 | pip install setuptools wheel twine 21 | - name: Build and publish 22 | env: 23 | TWINE_USERNAME: usc_isi_i2_admin 24 | TWINE_PASSWORD: ${{ secrets.PYPI }} 25 | run: | 26 | python setup.py sdist bdist_wheel 27 | twine upload dist/* 28 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | on: push 3 | jobs: 4 | run-tests: 5 | name: Run pytest 6 | runs-on: ubuntu-latest 7 | strategy: 8 | matrix: 9 | python-version: [3.7, 3.8, 3.9] 10 | steps: 11 | - name: Checkout code 12 | uses: actions/checkout@v2 13 | - name: Set up Python ${{ matrix.python-version }} 14 | uses: actions/setup-python@v2 15 | with: 16 | python-version: ${{ matrix.python-version }} 17 | - name: Install dependencies 18 | run: | 19 | python -m pip install --upgrade pip 20 | pip install -r requirements.txt 21 | pip install -r requirements_dev.txt 22 | pip install -e . 23 | pip install coverage coveralls 24 | - name: Test with pytest 25 | run: | 26 | python -m pytest -v --color=yes --cov rltk rltk/tests/test_* 27 | - name: Coverage 28 | env: 29 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 30 | run: | 31 | coveralls --service=github 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | .idea/ 92 | dev_test/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 University of Southern California 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include VERSION 3 | include requirements.txt -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: docs 2 | 3 | docs: 4 | @cd docs && make html 5 | 6 | run-docs: 7 | @cd docs/_build/html && python -m http.server 8080 --bind localhost 8 | 9 | release: 10 | @VERSION=$$(python -c "import rltk;print(rltk.__version__)") && git tag $$VERSION 11 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | RLTK: Record Linkage ToolKit 2 | ============================ 3 | 4 | .. begin-intro 5 | .. image:: https://img.shields.io/badge/license-MIT-blue.svg 6 | :target: https://raw.githubusercontent.com/usc-isi-i2/rltk/master/LICENSE 7 | :alt: License 8 | 9 | .. 
image:: https://github.com/usc-isi-i2/rltk/workflows/Tests/badge.svg?branch=master 10 | :target: https://github.com/usc-isi-i2/rltk/actions 11 | :alt: Github actions 12 | 13 | .. image:: https://coveralls.io/repos/github/usc-isi-i2/rltk/badge.svg?branch=master 14 | :target: https://coveralls.io/github/usc-isi-i2/rltk?branch=master 15 | :alt: Coveralls 16 | 17 | .. image:: https://badge.fury.io/py/rltk.svg 18 | :target: https://badge.fury.io/py/rltk 19 | :alt: pypi 20 | 21 | .. image:: https://readthedocs.org/projects/rltk/badge/?version=latest 22 | :target: http://rltk.readthedocs.io/en/latest 23 | :alt: Documents 24 | 25 | The Record Linkage ToolKit (RLTK) is a general-purpose open-source record linkage platform that allows users to build powerful Python programs that link records referring to the same underlying entity. Record linkage is an extremely important problem that shows up in domains extending from social networks to bibliographic data and biomedicine. Current open platforms for record linkage have problems scaling even to moderately sized datasets, or are just not easy to use (even by experts). RLTK attempts to address all of these issues. 26 | 27 | RLTK supports a full, scalable record linkage pipeline, including multi-core algorithms for blocking, profiling data, computing a wide variety of features, and training and applying machine learning classifiers based on Python’s sklearn library. An end-to-end RLTK pipeline can be jump-started with only a few lines of code. However, RLTK is also designed to be extensible and customizable, allowing users arbitrary degrees of control over many of the individual components. You can add new features to RLTK (e.g. a custom string similarity) very easily. 28 | 29 | RLTK is being built by the `Center on Knowledge Graphs `_ at `USC/ISI `_, with funding from multiple projects funded by the DARPA LORELEI and MEMEX programs and the IARPA CAUSE program. 30 | RLTK is under active maintenance and we expect to keep adding new features and state-of-the-art record linkage algorithms in the foreseeable future, in addition to continuously supporting our adopters to integrate the platform into their applications. 31 | 32 | Getting Started 33 | --------------- 34 | 35 | Installation (make sure prerequisites are installed):: 36 | 37 | pip install -U rltk 38 | 39 | Example:: 40 | 41 | >>> import rltk 42 | >>> rltk.levenshtein_distance('abc', 'abd') 43 | 1 44 | 45 | Try RLTK Online 46 | --------------- 47 | 48 | * `Stable version `_ 49 | * `Development version `_ 50 | 51 | .. end-intro 52 | 53 | Datasets & Experiments 54 | ---------------------- 55 | * `rltk-experimentation `_ 56 | 57 | Documentation 58 | ------------- 59 | 60 | * `Tutorials `_ 61 | * `API Reference `_ 62 | -------------------------------------------------------------------------------- /binder/README.md: -------------------------------------------------------------------------------- 1 | # RLTK Jupyter Binder 2 | 3 | This folder is used by [Binder](https://mybinder.org/). 4 | -------------------------------------------------------------------------------- /binder/postBuild: -------------------------------------------------------------------------------- 1 | git clone --depth 1 -b master https://github.com/usc-isi-i2/rltk-experimentation 2 | -------------------------------------------------------------------------------- /binder/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn 2 | -r ../requirements.txt 3 | -e . 
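# '-r' pulls in RLTK's runtime requirements and '-e .' installs RLTK itself in editable mode, so the Binder image can import rltk directly from the repository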
4 | 5 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = rltk 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/_static/logo-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/usc-isi-i2/rltk/aee10ed5dd561583e60db3373ed82fe1208da1e9/docs/_static/logo-icon.png -------------------------------------------------------------------------------- /docs/_static/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/usc-isi-i2/rltk/aee10ed5dd561583e60db3373ed82fe1208da1e9/docs/_static/logo.png -------------------------------------------------------------------------------- /docs/_static/style.css: -------------------------------------------------------------------------------- 1 | @import url("https://fonts.googleapis.com/css?family=Ubuntu+Mono"); 2 | @import url("https://fonts.googleapis.com/css?family=Open+Sans"); 3 | 4 | pre, code { 5 | font-family: "Ubuntu Mono", "Consolas", "Menlo", "DejaVu Sans Mono", "Bitstream Vera Sans Mono", monospace; 6 | font-size: 15px; 7 | } 8 | 9 | h1, h2, h3, h4, h5, h6, p.admonition-title, div.sphinxsidebar input, body { 10 | font-family: "Open Sans", "Helvetica", "Arial", sans-serif; 11 | } 12 | 13 | div.sphinxsidebar ul li.toctree-l1 > a { 14 | font-size: 100%; 15 | } 16 | 17 | div.sphinxsidebar ul li.toctree-l2 > a { 18 | font-size: 100%; 19 | } 20 | 21 | div.sphinxsidebar ul li.toctree-l3 > a { 22 | font-size: 100%; 23 | } 24 | 25 | div.body { 26 | max-width: 100%; /* overwrite basic.css */ 27 | } 28 | 29 | table.dataframe { 30 | border-collapse: collapse; 31 | /*width: 100%;*/ 32 | } 33 | 34 | table.dataframe th, table.dataframe td { 35 | text-align: left; 36 | padding: 8px; 37 | } 38 | 39 | table.dataframe tr:nth-child(even) { 40 | background-color: #f2f2f2; 41 | } 42 | 43 | blockquote { 44 | border-left: 5px solid #eeeeee; 45 | padding: 10px 20px; 46 | } 47 | 48 | div.sphinxsidebarwrapper p.logo { 49 | margin-bottom: 30px; 50 | } -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # rltk documentation build configuration file, created by 4 | # sphinx-quickstart on Thu Feb 23 13:46:31 2017. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 
14 | 15 | # If extensions (or modules to document with autodoc) are in another directory, 16 | # add these directories to sys.path here. If the directory is relative to the 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. 18 | # 19 | import os 20 | import sys 21 | import datetime 22 | sys.path.insert(0, os.path.abspath('../rltk')) 23 | sys.path.insert(0, os.path.abspath('../')) 24 | 25 | 26 | # -- General configuration ------------------------------------------------ 27 | 28 | # If your documentation needs a minimal Sphinx version, state it here. 29 | # 30 | # needs_sphinx = '1.0' 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 34 | # ones. 35 | extensions = ['sphinx.ext.autodoc', 'sphinxcontrib.napoleon', 'nbsphinx', 36 | 'IPython.sphinxext.ipython_console_highlighting'] # 'sphinx.ext.viewcode' 37 | 38 | # Add any paths that contain templates here, relative to this directory. 39 | templates_path = ['_templates'] 40 | 41 | # The suffix(es) of source filenames. 42 | # You can specify multiple suffix as a list of string: 43 | # 44 | # source_suffix = ['.rst', '.md'] 45 | source_suffix = '.rst' 46 | 47 | # The master toctree document. 48 | master_doc = 'index' 49 | 50 | # General information about the project. 51 | project = 'RLTK' 52 | copyright = '{}, USC/ISI'.format(datetime.datetime.now().year) 53 | author = 'USC/ISI' 54 | 55 | # The version info for the project you're documenting, acts as replacement for 56 | # |version| and |release|, also used in various other places throughout the 57 | # built documents. 58 | # 59 | with open('../rltk/__init__.py', 'r') as f: 60 | for line in f: 61 | if line.startswith('__version__'): 62 | exec(line) # fetch and create __version__ 63 | break 64 | rltk_version = __version__ 65 | # The short X.Y version. 66 | version = '.'.join(rltk_version.split('.')[:2]) 67 | # The full version, including alpha/beta/rc tags. 68 | release = rltk_version 69 | 70 | # The language for content autogenerated by Sphinx. Refer to documentation 71 | # for a list of supported languages. 72 | # 73 | # This is also used if you do content translation via gettext catalogs. 74 | # Usually you set "language" from the command line for these cases. 75 | language = None 76 | 77 | # List of patterns, relative to source directory, that match files and 78 | # directories to ignore when looking for source files. 79 | # This patterns also effect to html_static_path and html_extra_path 80 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', '**.ipynb_checkpoints'] 81 | 82 | # The name of the Pygments (syntax highlighting) style to use. 83 | pygments_style = 'sphinx' 84 | 85 | # If true, `todo` and `todoList` produce output, else they produce nothing. 86 | todo_include_todos = False 87 | 88 | 89 | # -- Options for HTML output ---------------------------------------------- 90 | 91 | # The theme to use for HTML and HTML Help pages. See the documentation for 92 | # a list of builtin themes. 93 | # 94 | html_theme = 'alabaster' # default, alabaster, pyramid, bizstyle 95 | 96 | # Theme options are theme-specific and customize the look and feel of a theme 97 | # further. For a list of options available for each theme, see the 98 | # documentation. 
99 | # 100 | html_theme_options = { 101 | 'logo': 'logo-icon.png', 102 | 'page_width': '1380px', 103 | 'sidebar_width': '220px', 104 | 'github_user': 'usc-isi-i2', 105 | 'github_repo': 'rltk', 106 | 'github_banner': 'true', 107 | 'github_type': 'star', 108 | 'extra_nav_links': { 109 | 'RLTK @ GitHub': 'https://github.com/usc-isi-i2/rltk', 110 | 'RLTK @ PyPI': 'https://pypi.org/project/rltk', 111 | 'Issue Tracker': 'https://github.com/usc-isi-i2/rltk/issues', 112 | 'USC/ISI CKG': 'http://usc-isi-i2.github.io/' 113 | }, 114 | 'show_powered_by': False 115 | } 116 | 117 | html_show_sourcelink = False 118 | 119 | html_sidebars = { 120 | '**': [ 121 | 'about.html', 122 | 'localtoc.html', 123 | 'navigation.html', 124 | # 'relations.html', 125 | 'searchbox.html', 126 | # 'donate.html', 127 | ] 128 | } 129 | 130 | # Add any paths that contain custom static files (such as style sheets) here, 131 | # relative to this directory. They are copied after the builtin static files, 132 | # so a file named "default.css" will overwrite the builtin "default.css". 133 | html_static_path = ['_static'] 134 | 135 | 136 | # -- Options for HTMLHelp output ------------------------------------------ 137 | 138 | # Output file base name for HTML help builder. 139 | htmlhelp_basename = 'rltkdoc' 140 | 141 | 142 | # -- Options for LaTeX output --------------------------------------------- 143 | 144 | latex_elements = { 145 | # The paper size ('letterpaper' or 'a4paper'). 146 | # 147 | # 'papersize': 'letterpaper', 148 | 149 | # The font size ('10pt', '11pt' or '12pt'). 150 | # 151 | # 'pointsize': '10pt', 152 | 153 | # Additional stuff for the LaTeX preamble. 154 | # 155 | # 'preamble': '', 156 | 157 | # Latex figure (float) alignment 158 | # 159 | # 'figure_align': 'htbp', 160 | } 161 | 162 | # Grouping the document tree into LaTeX files. List of tuples 163 | # (source start file, target name, title, 164 | # author, documentclass [howto, manual, or own class]). 165 | latex_documents = [ 166 | (master_doc, 'rltk.tex', 'RLTK Documentation', 167 | u'USC/ISI', 'manual'), 168 | ] 169 | 170 | 171 | # -- Options for manual page output --------------------------------------- 172 | 173 | # One entry per manual page. List of tuples 174 | # (source start file, name, description, authors, manual section). 175 | man_pages = [ 176 | (master_doc, 'rltk', 'RLTK Documentation', 177 | [author], 1) 178 | ] 179 | 180 | 181 | # -- Options for Texinfo output ------------------------------------------- 182 | 183 | # Grouping the document tree into Texinfo files. 
List of tuples 184 | # (source start file, target name, title, author, 185 | # dir menu entry, description, category) 186 | texinfo_documents = [ 187 | (master_doc, 'rltk', 'RLTK Documentation', 188 | author, 'rltk', 'Record Linkage ToolKit', 189 | 'Miscellaneous'), 190 | ] 191 | 192 | 193 | def setup(app): 194 | app.add_stylesheet('style.css') 195 | -------------------------------------------------------------------------------- /docs/images/overview-basic-workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/usc-isi-i2/rltk/aee10ed5dd561583e60db3373ed82fe1208da1e9/docs/images/overview-basic-workflow.png -------------------------------------------------------------------------------- /docs/images/overview-blocking-tables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/usc-isi-i2/rltk/aee10ed5dd561583e60db3373ed82fe1208da1e9/docs/images/overview-blocking-tables.png -------------------------------------------------------------------------------- /docs/images/overview-blocking-workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/usc-isi-i2/rltk/aee10ed5dd561583e60db3373ed82fe1208da1e9/docs/images/overview-blocking-workflow.png -------------------------------------------------------------------------------- /docs/images/overview-dataflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/usc-isi-i2/rltk/aee10ed5dd561583e60db3373ed82fe1208da1e9/docs/images/overview-dataflow.png -------------------------------------------------------------------------------- /docs/images/overview-evaluation-workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/usc-isi-i2/rltk/aee10ed5dd561583e60db3373ed82fe1208da1e9/docs/images/overview-evaluation-workflow.png -------------------------------------------------------------------------------- /docs/images/overview-inputs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/usc-isi-i2/rltk/aee10ed5dd561583e60db3373ed82fe1208da1e9/docs/images/overview-inputs.png -------------------------------------------------------------------------------- /docs/images/overview-tables.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/usc-isi-i2/rltk/aee10ed5dd561583e60db3373ed82fe1208da1e9/docs/images/overview-tables.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. rltk documentation master file, created by 2 | sphinx-quickstart on Thu Feb 23 13:46:31 2017. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | .. include:: ./../README.rst 7 | :start-after: begin-intro 8 | :end-before: end-intro 9 | 10 | 11 | Tutorial 12 | ------------- 13 | 14 | .. toctree:: 15 | :maxdepth: 2 16 | 17 | installation.rst 18 | overview.rst 19 | step_by_step.ipynb 20 | real_world_example.ipynb 21 | scaling_and_optimization.rst 22 | 23 | 24 | API Reference 25 | ------------- 26 | 27 | .. 
toctree:: 28 | :maxdepth: 3 29 | 30 | modules.rst 31 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | .. note:: 5 | 6 | RLTK only supports Python 3 and is tested under Python 3.7+. 7 | 8 | 9 | pip 10 | ---- 11 | 12 | Install using pip:: 13 | 14 | pip install rltk 15 | 16 | If you want to update RLTK:: 17 | 18 | pip install -U rltk 19 | 20 | Generally, it's recommended to install packages in a virtual environment:: 21 | 22 | virtualenv rltk_env 23 | source rltk_env/bin/activate 24 | pip install rltk 25 | 26 | 27 | Install from source 28 | ------------------- 29 | 30 | The other way to install RLTK is to clone the GitHub repository and build it from source:: 31 | 32 | git clone https://github.com/usc-isi-i2/rltk.git 33 | cd rltk 34 | 35 | virtualenv rltk_env 36 | source rltk_env/bin/activate 37 | pip install -e . 38 | 39 | Run tests 40 | --------- 41 | 42 | RLTK uses `pytest `_ for unit tests. To run them, simply run the following command from the root of the rltk package:: 43 | 44 | pytest 45 | 46 | If you need more detailed information, do:: 47 | 48 | pytest -v --color=yes 49 | 50 | Build documentation 51 | ------------------- 52 | 53 | Additional dependencies for building documentation should be installed first:: 54 | 55 | pip install -r requirements_docs.txt 56 | 57 | Documentation is powered by `Sphinx `_. To generate it locally, run:: 58 | 59 | cd docs 60 | make html # the generated doc is located at _build/html/index.html 61 | -------------------------------------------------------------------------------- /docs/mod_blocking.rst: -------------------------------------------------------------------------------- 1 | Blocking 2 | ======== 3 | 4 | Block 5 | ----- 6 | 7 | .. automodule:: rltk.blocking.block 8 | :members: 9 | :special-members: 10 | :exclude-members: __dict__, __weakref__, __init__ 11 | 12 | 13 | Block Black List 14 | ---------------- 15 | 16 | .. automodule:: rltk.blocking.block_black_list 17 | :members: 18 | :special-members: 19 | :exclude-members: __dict__, __weakref__, __init__ 20 | 21 | Block Generator 22 | --------------- 23 | 24 | .. automodule:: rltk.blocking.block_generator 25 | :members: 26 | :special-members: 27 | :exclude-members: __dict__, __weakref__, __init__ 28 | 29 | .. automodule:: rltk.blocking.hash_block_generator 30 | :members: 31 | :special-members: 32 | :exclude-members: __dict__, __weakref__, __init__ 33 | 34 | .. automodule:: rltk.blocking.token_block_generator 35 | :members: 36 | :special-members: 37 | :exclude-members: __dict__, __weakref__, __init__ 38 | 39 | .. automodule:: rltk.blocking.sorted_neighbourhood_block_generator 40 | :members: 41 | :special-members: 42 | :exclude-members: __dict__, __weakref__, __init__ 43 | 44 | .. automodule:: rltk.blocking.canopy_block_generator 45 | :members: 46 | :special-members: 47 | :exclude-members: __dict__, __weakref__, __init__ 48 | 49 | Blocking Helper 50 | --------------- 51 | 52 | .. automodule:: rltk.blocking.blocking_helper 53 | :members: 54 | :special-members: 55 | :exclude-members: __dict__, __weakref__, __init__ 56 | -------------------------------------------------------------------------------- /docs/mod_cli.rst: -------------------------------------------------------------------------------- 1 | Command line interface (CLI) 2 | ---------------------------- 3 | 
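RLTK ships a ``__main__`` module, so the CLI is invoked as ``python -m rltk``. A minimal usage sketch (the ``remote.*`` subcommands are the ones documented in :doc:`mod_remote`; the port and process count below are illustrative values, not defaults)::

    # start a scheduler for distributed computing
    python -m rltk remote.scheduler --port 8786

    # start a worker and attach it to that scheduler
    python -m rltk remote.worker 127.0.0.1:8786 --nprocs 4

4 | .. 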
automodule:: rltk.cli 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/mod_dataset.rst: -------------------------------------------------------------------------------- 1 | Dataset 2 | ======= 3 | 4 | .. automodule:: rltk.dataset 5 | :members: 6 | :special-members: 7 | :exclude-members: __dict__, __weakref__, __init__ -------------------------------------------------------------------------------- /docs/mod_evaluation.rst: -------------------------------------------------------------------------------- 1 | Evaluation 2 | ========== 3 | 4 | GroundTruth 5 | ----------- 6 | 7 | .. automodule:: rltk.evaluation.ground_truth 8 | :members: 9 | :special-members: 10 | :exclude-members: __dict__, __weakref__, __init__ 11 | 12 | Trial 13 | ----- 14 | 15 | .. automodule:: rltk.evaluation.trial 16 | :members: 17 | :special-members: 18 | :exclude-members: __dict__, __weakref__, __init__ 19 | 20 | Evaluation 21 | ---------- 22 | 23 | .. automodule:: rltk.evaluation.evaluation 24 | :members: 25 | :special-members: 26 | :exclude-members: __dict__, __weakref__, __init__ 27 | -------------------------------------------------------------------------------- /docs/mod_io.rst: -------------------------------------------------------------------------------- 1 | IO (Input & Output) 2 | =================== 3 | 4 | Reader 5 | ------ 6 | 7 | Generic Reader 8 | ^^^^^^^^^^^^^^ 9 | 10 | .. automodule:: rltk.io.reader.reader 11 | :members: 12 | :special-members: 13 | :exclude-members: __dict__, __weakref__, __init__ 14 | 15 | .. automodule:: rltk.io.reader.array_reader 16 | :members: 17 | :special-members: 18 | :exclude-members: __dict__, __weakref__, __init__ 19 | 20 | .. automodule:: rltk.io.reader.jsonlines_reader 21 | :members: 22 | :special-members: 23 | :exclude-members: __dict__, __weakref__, __init__ 24 | 25 | .. automodule:: rltk.io.reader.csv_reader 26 | :members: 27 | :special-members: 28 | :exclude-members: __dict__, __weakref__, __init__ 29 | 30 | .. automodule:: rltk.io.reader.dataframe_reader 31 | :members: 32 | :special-members: 33 | :exclude-members: __dict__, __weakref__, __init__ 34 | 35 | GroundTruth Reader 36 | ^^^^^^^^^^^^^^^^^^ 37 | 38 | .. automodule:: rltk.io.reader.ground_truth_reader 39 | :members: 40 | :special-members: 41 | :exclude-members: __dict__, __weakref__, __init__ 42 | 43 | Writer 44 | ------ 45 | 46 | Generic Writer 47 | ^^^^^^^^^^^^^^ 48 | 49 | .. automodule:: rltk.io.writer.writer 50 | :members: 51 | :special-members: 52 | :exclude-members: __dict__, __weakref__, __init__ 53 | 54 | GroundTruth Writer 55 | ^^^^^^^^^^^^^^^^^^ 56 | 57 | .. automodule:: rltk.io.writer.ground_truth_writer 58 | :members: 59 | :special-members: 60 | :exclude-members: __dict__, __weakref__, __init__ 61 | 62 | Adapter 63 | ------- 64 | 65 | Key Value Adapter 66 | ^^^^^^^^^^^^^^^^^ 67 | 68 | .. automodule:: rltk.io.adapter.key_value_adapter 69 | :members: 70 | :special-members: 71 | :exclude-members: __dict__, __weakref__, __init__ 72 | 73 | .. automodule:: rltk.io.adapter.memory_key_value_adapter 74 | :members: 75 | :special-members: 76 | :exclude-members: __dict__, __weakref__, __init__ 77 | 78 | .. automodule:: rltk.io.adapter.redis_key_value_adapter 79 | :members: 80 | :special-members: 81 | :exclude-members: __dict__, __weakref__, __init__ 82 | 83 | .. automodule:: rltk.io.adapter.hbase_key_value_adapter 84 | :members: 85 | :special-members: 86 | :exclude-members: __dict__, __weakref__, __init__ 87 | 88 | .. 
automodule:: rltk.io.adapter.dbm_key_value_adapter 89 | :members: 90 | :special-members: 91 | :exclude-members: __dict__, __weakref__, __init__ 92 | 93 | Key Set Adapter 94 | ^^^^^^^^^^^^^^^ 95 | 96 | .. automodule:: rltk.io.adapter.key_set_adapter 97 | :members: 98 | :special-members: 99 | :exclude-members: __dict__, __weakref__, __init__ 100 | 101 | .. automodule:: rltk.io.adapter.memory_key_set_adapter 102 | :members: 103 | :special-members: 104 | :exclude-members: __dict__, __weakref__, __init__ 105 | 106 | .. automodule:: rltk.io.adapter.leveldb_key_set_adapter 107 | :members: 108 | :special-members: 109 | :exclude-members: __dict__, __weakref__, __init__ 110 | 111 | .. automodule:: rltk.io.adapter.redis_key_set_adapter 112 | :members: 113 | :special-members: 114 | :exclude-members: __dict__, __weakref__, __init__ 115 | 116 | Serializer 117 | ---------- 118 | 119 | .. automodule:: rltk.io.serializer.serializer 120 | :members: 121 | :special-members: 122 | :exclude-members: __dict__, __weakref__, __init__ 123 | 124 | .. automodule:: rltk.io.serializer.pickle_serializer 125 | :members: 126 | :special-members: 127 | :exclude-members: __dict__, __weakref__, __init__ 128 | 129 | Utilities 130 | --------- 131 | 132 | .. automodule:: rltk.io.io_utils 133 | :members: 134 | :special-members: 135 | :exclude-members: __dict__, __weakref__, __init__ 136 | -------------------------------------------------------------------------------- /docs/mod_record.rst: -------------------------------------------------------------------------------- 1 | Record 2 | ====== 3 | 4 | .. automodule:: rltk.record 5 | :members: 6 | :special-members: 7 | :exclude-members: __dict__, __weakref__, __init__ -------------------------------------------------------------------------------- /docs/mod_remote.rst: -------------------------------------------------------------------------------- 1 | Remote 2 | ====== 3 | 4 | RLTK's remote module is based on `Dask's distributed `_. It has one `scheduler`, which coordinates the actions of several `worker` processes spread across multiple machines and the concurrent requests of several clients. 5 | 6 | To start the scheduler, do: 7 | 8 | .. code-block:: bash 9 | 10 | python -m rltk remote.scheduler --port <port> 11 | 12 | Then on worker machines, do 13 | 14 | .. code-block:: bash 15 | 16 | python -m rltk remote.worker <scheduler ip>:<port> --nprocs <number of processes> 17 | 18 | Authentication is supported through Privacy Enhanced Mail (PEM) files. You can either get them from a CA (Certificate Authority) or generate self-signed PEM files locally. Here's an example of generating PEM files using `OpenSSL `_: 19 | 20 | .. code-block:: bash 21 | 22 | openssl req -newkey rsa:2048 -new -nodes -x509 -days 3650 -keyout key.pem -out cert.pem 23 | 24 | Then provide these PEM files when starting the scheduler and workers. If you don't have a CA certificate, set `tls-ca-file` to the same file as `tls-cert`. 25 | 26 | .. code-block:: bash 27 | 28 | # scheduler 29 | python -m rltk remote.scheduler --port <port> --tls-ca-file cert.pem --tls-cert cert.pem --tls-key key.pem 30 | 31 | # worker, specify protocol TLS in scheduler's address 32 | python -m rltk remote.worker tls://<scheduler ip>:<port> --tls-ca-file cert.pem --tls-cert cert.pem --tls-key key.pem 33 | 34 | Dask provides a web UI to monitor scheduler and worker status; detailed usage can be found `here `_. 35 | 36 | Remote 37 | ------ 38 | 39 | .. automodule:: rltk.remote.remote 40 | :members: 41 | :special-members: 42 | :exclude-members: __dict__, __weakref__, __init__ 43 | 44 | Task 45 | ---- 46 | 
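A `Task` pairs an input handler, which runs on the workers, with an output handler, which runs in the local process. A minimal sketch, mirroring the example in :doc:`scaling_and_optimization` (here ``remote`` is a connected ``rltk.remote.Remote`` object, and ``is_pair``, ``ds1`` and ``ds2`` come from your own program)::

    def input_handler(r1, r2):
        # runs on a worker: do the heavy comparison there
        return r1, r2, is_pair(r1, r2)

    def output_handler(r1, r2, label):
        # runs locally: collect the computed results
        print(r1.id, r2.id, label)

    task = rltk.remote.Task(remote, input_handler=input_handler,
                            output_handler=output_handler)
    task.start()

    for r1, r2 in rltk.get_record_pairs(ds1, ds2):
        task.compute(r1, r2)

    task.task_done()
    task.join()

47 | .. 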
automodule:: rltk.remote.task 48 | :members: 49 | :special-members: 50 | :exclude-members: __dict__, __weakref__, __init__ -------------------------------------------------------------------------------- /docs/mod_similarity.rst: -------------------------------------------------------------------------------- 1 | Similarity 2 | ========== 3 | 4 | Normal metrics 5 | -------------- 6 | 7 | .. automodule:: rltk.similarity.equal 8 | :members: 9 | 10 | .. automodule:: rltk.similarity.hamming 11 | :members: 12 | 13 | .. automodule:: rltk.similarity.dice 14 | :members: 15 | 16 | .. automodule:: rltk.similarity.levenshtein 17 | :members: 18 | 19 | .. automodule:: rltk.similarity.needleman 20 | :members: 21 | 22 | .. automodule:: rltk.similarity.jaro 23 | :members: 24 | 25 | .. automodule:: rltk.similarity.jaccard 26 | :members: 27 | 28 | .. automodule:: rltk.similarity.cosine 29 | :members: 30 | 31 | .. automodule:: rltk.similarity.tf_idf 32 | :members: 33 | 34 | 35 | Hybrid metrics 36 | -------------- 37 | 38 | .. automodule:: rltk.similarity.hybrid 39 | :members: 40 | 41 | 42 | Phonetic metrics 43 | ---------------- 44 | 45 | .. automodule:: rltk.similarity.soundex 46 | :members: 47 | 48 | .. automodule:: rltk.similarity.metaphone 49 | :members: 50 | 51 | .. automodule:: rltk.similarity.nysiis 52 | :members: 53 | -------------------------------------------------------------------------------- /docs/mod_tokenizer.rst: -------------------------------------------------------------------------------- 1 | Tokenizer 2 | ========= 3 | 4 | .. automodule:: rltk.tokenizer 5 | :members: 6 | :special-members: 7 | :exclude-members: __dict__, __weakref__, __init__ -------------------------------------------------------------------------------- /docs/mod_utils.rst: -------------------------------------------------------------------------------- 1 | Utilities 2 | --------- 3 | 4 | .. automodule:: rltk.utils 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ================= 3 | 4 | .. 
toctree:: 5 | :maxdepth: 4 6 | 7 | mod_dataset.rst 8 | mod_record.rst 9 | mod_similarity.rst 10 | mod_evaluation.rst 11 | mod_blocking.rst 12 | mod_io.rst 13 | mod_utils.rst 14 | mod_cli.rst 15 | mod_remote.rst 16 | mod_tokenizer.rst 17 | -------------------------------------------------------------------------------- /docs/resources/dblp_scholar_gt.csv: -------------------------------------------------------------------------------- 1 | idDBLP,idScholar 2 | conf/vldb/RothS97,gIFZWp_iCmUJ 3 | conf/sigmod/HellersteinHW97,7dcgPQxHuCMJ 4 | journals/tods/StolboushkinT98,ZnWLup8HMkUJ 5 | journals/sigmod/BohmR94,wLNJcNvsulkJ 6 | journals/vldb/Sarawagi01,Xx6kw0tCeQIJ 7 | conf/sigmod/FernandezFKLS97,ek26aiEheesJ 8 | journals/sigmod/Libkin99,zSO1Y5W7WkwJ 9 | journals/sigmod/KappelR98,tbZ0J3HLI18J 10 | conf/sigmod/TatarinovVBSSZ02,0HlMHEPJRH4J 11 | conf/vldb/SistlaYH94,_jl3bN2QlE4J 12 | conf/vldb/PetrovicBJ03,nIuz3dc8yHMJ 13 | journals/sigmod/FlorescuLM98,zkbTv93Zp1UJ 14 | conf/sigmod/AdaliCPS96,rmtEGXAXHKIJ 15 | journals/tods/FranklinCL97,-iaSLKFHwUkJ 16 | conf/sigmod/MamoulisP99,RusJdYPDgQ4J 17 | conf/sigmod/CherniackZ96,c9Humx2-EMgJ 18 | conf/vldb/DarFJST96,xhle0bk7qsMJ 19 | journals/sigmod/Yang94,sHJ914nPZtUJ 20 | conf/sigmod/TatarinovIHW01,jfkafZcMjgIJ 21 | conf/sigmod/BreunigKKS01,LxyVmHubIfUJ 22 | conf/sigmod/MelnikRB03,wfjeWtEY2NcJ 23 | conf/sigmod/HernandezMHYHT01,wMa4fMryrt0J 24 | journals/vldb/BarbaraI95,bTYTn8VG5hIJ 25 | conf/vldb/MedianoCD94,qwjRkZuiMHsJ 26 | conf/vldb/DattaDTVRF01,rDObsYKVroMJ 27 | journals/sigmod/AtkinsonDJPS96,yfjKkIhvfXcJ 28 | conf/vldb/SrivastavaDJL96,WvvY_Ao19mAJ 29 | conf/vldb/DattaDTVRF01,8_5A88ESaQ0J 30 | journals/vldb/VerykiosME03,khdP4spNnPoJ 31 | conf/sigmod/SimmenSM96,XVP8s4K0Bg4J 32 | conf/sigmod/ZhangDWEMPMDR03,1hkVjoUg8hUJ 33 | journals/sigmod/AshishK97,3zbpGI3YqXUJ 34 | conf/vldb/RohmBSS02,YMcmy4FOXi8J 35 | journals/tods/GoldmanL94,VxMarpzwtzQJ 36 | conf/vldb/DeutschPT99,0aJOXauNqYIJ 37 | conf/sigmod/AdelbergGW97,XytbGy--8LMJ 38 | conf/vldb/RaghavanG01,9Wo54Wyh_X8J 39 | conf/vldb/AmbiteKKMST01,F4DtzxvVZnoJ 40 | journals/tods/LitwinN96,l0W27c1C3NwJ 41 | journals/sigmod/DogacDKOONEHAKKM95,oAO74aolStoJ 42 | journals/sigmod/DiazJPa94,sG7PCEiN2xAJ 43 | conf/sigmod/MogiK96,G-ggEZEKjT8J 44 | conf/sigmod/HanKS97,XFCkL9QhTjIJ 45 | conf/vldb/Raghavan96,zuVOWDbv0lsJ 46 | conf/sigmod/LitwinS00,Ad5NAPgWIIUJ 47 | conf/sigmod/ZhouWGGZWXYF03,AwFxLUGiceUJ 48 | conf/vldb/Brin95,jXvsW6VxbMYJ 49 | conf/vldb/ShanmugasundaramGTZDN99,kaq5eLrzrQsJ 50 | journals/tods/FernandezKSMT02,ckrgSn0vBOMJ 51 | conf/sigmod/Kramer97,noTo81QxmHQJ 52 | conf/sigmod/BerchtoldK98,VuMPPr6k95AJ 53 | journals/vldb/LiR99,Ko9e8CH2Si4J 54 | journals/sigmod/GrayG97,ruMkEFagTIUJ 55 | conf/sigmod/LeeKOTM00,C8RLbWKCgicJ 56 | conf/sigmod/GibbonsM98,x4HkJDEYFmYJ 57 | conf/sigmod/AshleyFHLNP95,9uxj2XzGt9UJ 58 | conf/sigmod/GardarinMP95,IOqPoq2MSvQJ 59 | conf/sigmod/Brown99,itP4yy9sLUUJ 60 | conf/sigmod/AcharyaGPR99a,IkNOhDqEY18J 61 | conf/sigmod/HuangSW94,xF8s5N7oUIMJ 62 | journals/tods/GuoSW96,V8Ls_TYs6mgJ 63 | conf/vldb/CeriM03,G2QuI5QYYMoJ 64 | conf/vldb/ChristodoulakisTZ97,D0z0BDnbnFcJ 65 | conf/sigmod/HammerGNYBV97,2rysKgS6lugJ 66 | conf/sigmod/BhattacharyaMBNHS02,cg01BqanXhUJ 67 | journals/sigmod/SouzaS99,OmYc0wE1j4kJ 68 | conf/vldb/WangC03,nXj_8Y7lmHsJ 69 | journals/sigmod/Baeza-YatesN96,cvchvz9-m_oJ 70 | conf/vldb/CosleyLP02,BTalXWt3faUJ 71 | conf/vldb/RothS97,DwwSuaisX5QJ 72 | conf/sigmod/GriffinL95,6QZGeKna5lgJ 73 | conf/sigmod/SrikantA96,tTQpdbZYZGoJ 74 | conf/sigmod/CranorGJSS02,Up9QQITiHzAJ 75 
| journals/sigmod/HasanFV96,vIBvTfMLL4UJ 76 | journals/sigmod/BunemanRU97,5qg4BNiroqMJ 77 | journals/vldb/DanYC95,SsOLRJZrmtYJ 78 | journals/tods/CliffordDIJS97,f1wgD54UUKwJ 79 | conf/vldb/CareyD96,g8jnRVyQukAJ 80 | conf/sigmod/NybergBCGL94,fXziEl_Htv8J 81 | journals/vldb/HarrisR96,S8x6zjXc9oAJ 82 | journals/sigmod/HummerLW02,W1IcM8IUwAEJ 83 | journals/sigmod/DarwenD95,EXwe9r79qxEJ 84 | conf/sigmod/HungYK96,bTI28RjBpPwJ 85 | journals/sigmod/TatarinovIMHSDDKMM03,rrSZOoViGqoJ 86 | journals/vldb/LeeSC01,Hz_TU6kUj08J 87 | conf/sigmod/SagonasSW94a,xc8KEoeRT9sJ 88 | conf/vldb/ChakravarthyKAK94,3M_0Kd8NNjgJ 89 | conf/sigmod/LometW98,F2ecYx97F2sJ 90 | conf/vldb/AgrawalS94,cIJQ0qxrkMIJ 91 | journals/vldb/RahmB01,707RPzHAB8YJ 92 | conf/vldb/Hammond96,LDxiVdRU6EEJ 93 | journals/sigmod/SilberschatzSU96,soiN2U4tXykJ 94 | journals/sigmod/PourabbasR00,3y6y5AXu-j0J 95 | journals/vldb/PottingerH01,mpnnRdFUUJQJ 96 | conf/sigmod/MankuRL99,X94gDE70Zn0J 97 | conf/vldb/MeccaCM01,Ph7ZpmdNOPEJ 98 | -------------------------------------------------------------------------------- /docs/scaling_and_optimization.rst: -------------------------------------------------------------------------------- 1 | Scaling and Optimization 2 | ======================== 3 | 4 | One important feature of RLTK is scalability. It can either work with very limited resources or utilize large amounts of resources. 5 | 6 | Set proper arguments 7 | -------------------- 8 | 9 | Some of the methods have optional or required arguments for buffer size, chunk size, queue size, etc. Giving them proper values according to your machine's specification can avoid a lot of unnecessary memory-disk swap operations. 10 | 11 | Parallel processing 12 | ------------------- 13 | 14 | Here you need to use a package called `pyrallel `_. 15 | 16 | General parallel processing 17 | ``````````````````````````` 18 | 19 | If you have some compute-intensive procedures and your machine has more than one CPU core, `pyrallel.ParallelProcessor` is a tool to try. You can find more detailed information in its API documentation, but in general, it encapsulates multiprocessing for parallel computation and multithreading for data collection. 20 | 21 | .. code-block:: python 22 | 23 | result = [] 24 | 25 | def heavy_calculation(x, y): 26 | return x * x, y + 5 27 | 28 | def output_handler(r1, r2): 29 | result.append(r1 if r1 > r2 else r2) 30 | 31 | pp = pyrallel.ParallelProcessor(8, mapper=heavy_calculation, collector=output_handler) 32 | pp.start() 33 | 34 | for i in range(8): 35 | pp.add_task(i, i + 1) 36 | 37 | pp.task_done() 38 | pp.join() 39 | 40 | print(result) 41 | 42 | 43 | MapReduce 44 | ````````` 45 | 46 | The above solution uses one thread (in the main process) for collecting calculated data. If you want to do something like divide and conquer, especially when the "conquer" step needs heavy calculation, you may need the `pyrallel.MapReduce` module. 47 | 48 | .. code-block:: python 49 | 50 | def mapper(x): 51 | time.sleep(0.0001) 52 | return x 53 | 54 | def reducer(r1, r2): 55 | return r1 + r2 56 | 57 | mr = pyrallel.MapReduce(8, mapper, reducer) 58 | for i in range(10000): 59 | mr.add_task(i) 60 | 61 | mr.task_done() 62 | result = mr.join() 63 | print(result) 64 | 65 | Distributed computing (Experimental) 66 | ------------------------------------ 67 | 68 | .. note:: 69 | 70 | Running RLTK on a cluster is not necessarily faster than running it on a single machine; performance depends on the requirements, the data, and the code. 
If you only have tiny datasets and a light task, parallel computing is not needed either; creating processes and switching thread contexts both have costs. Similarly, distributed computing adds IO cost (especially network IO) and is harder to debug, so use it only when you really need it. Much of the time, refactoring the code has a bigger boosting effect. 71 | 72 | If you have extremely heavy computation or very large datasets, and you also have multiple idle machines, you may consider using distributed computing. More detailed usage is in the API documentation: :doc:`mod_remote`. 73 | 74 | First you need to set up a cluster. A cluster is formed by one scheduler and a group of workers. 75 | 76 | To start a scheduler, do 77 | 78 | .. code-block:: bash 79 | 80 | python -m rltk remote.scheduler 81 | 82 | Then on worker machines, do 83 | 84 | .. code-block:: bash 85 | 86 | python -m rltk remote.worker <scheduler ip>:8786 --nprocs <number of processes> 87 | 88 | Second, change your code a bit and run it. The API for distributed computing is very similar to `pyrallel.ParallelProcessor`, but you need a `rltk.remote.Remote` object which connects to the scheduler and an instance of `rltk.remote.Task` which has an input and an output handler. 89 | 90 | .. code-block:: python 91 | 92 | def input_handler(r1, r2): 93 | return r1, r2, is_pair(r1, r2) 94 | 95 | def output_handler(r1, r2, label): 96 | print(r1.id, r2.id, label) 97 | 98 | remote = rltk.remote.Remote('127.0.0.1:8786') 99 | task = rltk.remote.Task(remote, input_handler=input_handler, output_handler=output_handler) 100 | task.start() 101 | 102 | for r1, r2 in rltk.get_record_pairs(ds1, ds2): 103 | task.compute(r1, r2) 104 | 105 | task.task_done() 106 | task.join() 107 | 108 | If the data is in a shared data store (file systems or services), there's no need to transfer record data through the scheduler to the workers; record ids are enough. Workers can then get the data directly from the data store. So change your code to make `input_handler` accept ids as input and fetch the record data inside it. 109 | 110 | .. 
code-block:: python 111 | :emphasize-lines: 1,2,9 112 | 113 | def input_handler(id1, id2): 114 | r1, r2 = ds1.get_record(id1), ds2.get_record(id2) 115 | return is_pair(r1, r2) 116 | 117 | task = rltk.remote.Task(remote, input_handler=input_handler, output_handler=output_handler) 118 | task.start() 119 | 120 | for r1, r2 in rltk.get_record_pairs(ds1, ds2): 121 | task.compute(r1.id, r2.id) 122 | 123 | task.task_done() 124 | task.join() 125 | -------------------------------------------------------------------------------- /examples/basic/auto_record.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import rltk 3 | 4 | print('from dataframe...') 5 | 6 | df = pd.read_csv('ds1.csv', encoding='latin-1') 7 | df['id'] = df['doc_id'].astype('str') 8 | 9 | 10 | class DFRecord(rltk.AutoGeneratedRecord): 11 | pass 12 | 13 | 14 | ds = rltk.Dataset(rltk.DataFrameReader(df), record_class=DFRecord) 15 | for r in ds: 16 | print(r.id, r.doc_id, r.doc_value) 17 | 18 | 19 | print('set id column...') 20 | 21 | 22 | @rltk.set_id('col1', function_=lambda x: str(x), keep_original=True) 23 | class DFRecord2(rltk.AutoGeneratedRecord): 24 | pass 25 | 26 | 27 | df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]}) 28 | ds = rltk.Dataset(reader=rltk.DataFrameReader(df), record_class=DFRecord2) 29 | for r in ds: 30 | print(r.id, r.col1, r.col2) 31 | -------------------------------------------------------------------------------- /examples/basic/basic.py: -------------------------------------------------------------------------------- 1 | import rltk 2 | 3 | 4 | class Record1(rltk.Record): 5 | @property 6 | def id(self): 7 | return self.raw_object['doc_id'] 8 | 9 | @property 10 | def value(self): 11 | return self.raw_object['doc_value'] 12 | 13 | @property 14 | def parent_id(self): 15 | return '4' if self.id == '1' else None 16 | 17 | 18 | class Record2(rltk.Record): 19 | @rltk.cached_property 20 | def id(self): 21 | return self.raw_object['ident'] 22 | 23 | @rltk.cached_property 24 | def value(self): 25 | v = self.raw_object.get('values', list()) 26 | return v[0] if len(v) > 0 else 'empty' 27 | 28 | ds1 = rltk.Dataset(reader=rltk.CSVReader('ds1.csv'), 29 | record_class=Record1, adapter=rltk.MemoryKeyValueAdapter()) 30 | ds2 = rltk.Dataset(reader=rltk.JsonLinesReader('ds2.jl'), 31 | record_class=Record2, adapter=rltk.DbmKeyValueAdapter('file_index')) 32 | 33 | pairs = rltk.get_record_pairs(ds1, ds2) 34 | for r1, r2 in pairs: 35 | print('-------------') 36 | print(r1.id, r1.value, '\t', r2.id, r2.value) 37 | if r1.parent_id: 38 | print('r1\'s parent', r1.parent_id, ds1.get_record(r1.parent_id).value) 39 | print('levenshtein_distance:', rltk.levenshtein_distance(r1.value, r2.value)) 40 | print('levenshtein_similarity:', rltk.levenshtein_similarity(r1.value, r2.value)) 41 | -------------------------------------------------------------------------------- /examples/basic/cached_record.py: -------------------------------------------------------------------------------- 1 | import rltk 2 | 3 | 4 | @rltk.remove_raw_object 5 | class Record1(rltk.Record): 6 | 7 | @rltk.cached_property 8 | def id(self): 9 | print('--> compute id:', self.raw_object['doc_id']) 10 | return self.raw_object['doc_id'] 11 | 12 | @rltk.cached_property 13 | def value(self): 14 | print('--> compute value:', self.raw_object['doc_value']) 15 | return self.raw_object['doc_value'] 16 | 17 | @property 18 | def id_and_value(self): 19 | print('--> compute id_and_value') 20 | return self.id + '-' + self.value 21 | 
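# Expected behavior when running this example (brief notes): the
# '--> compute id/value' messages should print only once per record, because
# `rltk.cached_property` stores the computed value in the record's `__dict__`
# (see the 'cache in dict' print below), while the plain property
# `id_and_value` is recomputed on every access. `@rltk.remove_raw_object`
# drops `raw_object` once the cached properties are materialized, so only the
# computed values reach the adapter. Note: the Redis / HBase adapters below
# need a running Redis / HBase service to connect to.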
22 | 23 | arr = [ 24 | {'doc_id': '1', 'doc_value': 'a'}, 25 | {'doc_id': '2', 'doc_value': 'b'}, 26 | {'doc_id': '3', 'doc_value': 'c'} 27 | ] 28 | # adapter = rltk.RedisKeyValueAdapter(host='127.0.0.1', key_prefix='cached_') 29 | adapter = rltk.HBaseKeyValueAdapter(host='127.0.0.1', key_prefix='test_', table='rltk_test1') 30 | ds1 = rltk.Dataset(reader=rltk.ArrayReader(arr), record_class=Record1, adapter=adapter) 31 | for r1 in ds1: 32 | print('------------') 33 | print('id:', r1.id) 34 | print('value:', r1.value) 35 | print('id_and_value:', r1.id_and_value) 36 | print('cache in dict:', r1.__dict__) 37 | -------------------------------------------------------------------------------- /examples/basic/dedup.py: -------------------------------------------------------------------------------- 1 | import rltk 2 | 3 | 4 | raw_inputs = [ 5 | {'name': 'a1', 'age': 10, 'id': 1}, 6 | {'name': 'a2', 'age': 20, 'id': 2}, 7 | {'name': 'a3', 'age': 30, 'id': 3}, 8 | {'name': 'a3', 'age': 30, 'id': 4}, 9 | {'name': 'a1', 'age': 10, 'id': 5}, 10 | ] 11 | 12 | 13 | class MyRecord(rltk.Record): 14 | 15 | @property 16 | def id(self): 17 | return str(self.raw_object['id']) 18 | 19 | @property 20 | def name(self): 21 | return self.raw_object['name'] 22 | 23 | @property 24 | def age(self): 25 | return self.raw_object['age'] 26 | 27 | 28 | ds = rltk.Dataset(reader=rltk.ArrayReader(raw_inputs), record_class=MyRecord) 29 | for r, r_ in rltk.get_record_pairs(ds): 30 | print('comparing', r.id, r_.id, r.name == r_.name and r.age == r_.age) 31 | -------------------------------------------------------------------------------- /examples/basic/ds1.csv: -------------------------------------------------------------------------------- 1 | doc_id,doc_value 2 | 1,hello 3 | 2,world 4 | 3,foo 5 | 4,bar -------------------------------------------------------------------------------- /examples/basic/ds2.jl: -------------------------------------------------------------------------------- 1 | {"ident": "a", "values":["a1"]} 2 | {"ident": "b", "values":["b1", "b2"]} 3 | {"ident": "c", "values":["c1"]} 4 | 5 | {"ident": "d"} 6 | -------------------------------------------------------------------------------- /examples/blocking/block_io_operations.py: -------------------------------------------------------------------------------- 1 | import rltk 2 | 3 | b1 = rltk.Block() 4 | b1.add('001', '1', '1') 5 | b1.add('001', '2', 'a') 6 | b1.add('002', '1', '2') 7 | b1.add('002', '2', 'b') 8 | b1.add('002', '2', 'c') 9 | print('--- block1 ---') 10 | for bb in b1: 11 | print(bb) 12 | 13 | b2 = rltk.Block() 14 | b2.add('001', '1', '1') 15 | b2.add('001', '2', 'a') 16 | b2.add('001', '2', 'd') 17 | b2.add('002', '1', '1') 18 | b2.add('002', '2', 'c') 19 | b2.add('002', '3', 'k') 20 | print('--- block2 (pairwise) ---') 21 | for bb in b2.pairwise('1', '2'): 22 | print(bb) 23 | print('--- block2 (pairwise, single dataset) ---') 24 | for bb in b2.pairwise('2'): 25 | print(bb) 26 | 27 | b1_inverted = rltk.BlockingHelper.generate_inverted_indices(b1) 28 | b2_inverted = rltk.BlockingHelper.generate_inverted_indices(b2) 29 | b3 = rltk.BlockingHelper.union(b1, b1_inverted, b2, b2_inverted) 30 | print('--- union ---') 31 | for bb in b3: 32 | print(bb) 33 | print('--- union raw ---') 34 | for rr in b3.key_set_adapter: 35 | print(rr) 36 | 37 | b4 = rltk.BlockingHelper.intersect(b1, b1_inverted, b2, b2_inverted) 38 | print('--- intersect --') 39 | for bb in b4: 40 | print(bb) 41 | print('--- intersect raw --') 42 | for rr in b4.key_set_adapter: 43 | print(rr) 44 
| -------------------------------------------------------------------------------- /examples/blocking/canopy.py: -------------------------------------------------------------------------------- 1 | import rltk 2 | import math 3 | 4 | 5 | @rltk.remove_raw_object 6 | class Record1(rltk.Record): 7 | 8 | @rltk.cached_property 9 | def id(self): 10 | return self.raw_object['doc_id'] 11 | 12 | @rltk.cached_property 13 | def first_name(self): 14 | return self.raw_object['first name'] 15 | 16 | @rltk.cached_property 17 | def last_name(self): 18 | return self.raw_object['last name'] 19 | 20 | @property 21 | def full_name(self): 22 | return self.first_name + ' ' + self.last_name 23 | 24 | 25 | @rltk.remove_raw_object 26 | class Record2(rltk.Record): 27 | 28 | @rltk.cached_property 29 | def id(self): 30 | return self.raw_object['ident'] 31 | 32 | @rltk.cached_property 33 | def first_name(self): 34 | return self.raw_object['name'].split(' ')[0] 35 | 36 | @rltk.cached_property 37 | def last_name(self): 38 | return self.raw_object['name'].split(' ')[1] 39 | 40 | @property 41 | def full_name(self): 42 | return self.first_name + ' ' + self.last_name 43 | 44 | ds1 = rltk.Dataset(reader=rltk.CSVReader('ds1.csv', delimiter=','), record_class=Record1) 45 | ds2 = rltk.Dataset(reader=rltk.JsonLinesReader('ds2.jl'), record_class=Record2) 46 | 47 | # for r in ds1: 48 | # print(r.first_name) 49 | # for r in ds2: 50 | # print(r.first_name) 51 | 52 | 53 | def vectorize(r): 54 | return [ord(r.first_name[0].lower()) - 0x61, 2] 55 | 56 | 57 | def distance_metric(vec1, vec2): 58 | vec1, vec2 = float(vec1[0]), float(vec2[0]) 59 | return math.sqrt((vec1 - vec2) ** 2) 60 | 61 | bg = rltk.CanopyBlockGenerator(t1=10, t2=5, distance_metric=distance_metric) 62 | block = bg.generate( 63 | bg.block(ds1, function_=vectorize), 64 | bg.block(ds2, function_=vectorize)) 65 | pairs = rltk.get_record_pairs(ds1, ds2, block=block) 66 | for r1, r2 in pairs: 67 | print(r1.id, r1.full_name, '\t', r2.id, r2.full_name) 68 | -------------------------------------------------------------------------------- /examples/blocking/ds1.csv: -------------------------------------------------------------------------------- 1 | doc_id,first name,last name 2 | 1,alice, A 3 | 2,bob, A 4 | 3,craig, B 5 | 4,david, C -------------------------------------------------------------------------------- /examples/blocking/ds2.jl: -------------------------------------------------------------------------------- 1 | {"ident": "a", "name":"alien wong"} 2 | {"ident": "b", "name":"bob lee"} 3 | {"ident": "c", "name":"deck knight"} 4 | {"ident": "d", "name":"joe martin"} 5 | -------------------------------------------------------------------------------- /examples/blocking/generate_blocks.py: -------------------------------------------------------------------------------- 1 | import rltk 2 | 3 | 4 | @rltk.remove_raw_object 5 | class Record1(rltk.Record): 6 | 7 | @rltk.cached_property 8 | def id(self): 9 | return self.raw_object['doc_id'] 10 | 11 | @rltk.cached_property 12 | def first_name(self): 13 | return self.raw_object['first name'] 14 | 15 | @rltk.cached_property 16 | def last_name(self): 17 | return self.raw_object['last name'] 18 | 19 | 20 | @rltk.remove_raw_object 21 | class Record2(rltk.Record): 22 | 23 | @rltk.cached_property 24 | def id(self): 25 | return self.raw_object['ident'] 26 | 27 | @rltk.cached_property 28 | def first_name(self): 29 | return self.raw_object['name'].split(' ')[0] 30 | 31 | @rltk.cached_property 32 | def last_name(self): 33 | return 
self.raw_object['name'].split(' ')[1] 34 | 35 | 36 | ds1 = rltk.Dataset(reader=rltk.CSVReader('ds1.csv', delimiter=','), 37 | record_class=Record1, adapter=rltk.MemoryKeyValueAdapter()) 38 | ds2 = rltk.Dataset(reader=rltk.JsonLinesReader('ds2.jl'), 39 | record_class=Record2, adapter=rltk.MemoryKeyValueAdapter()) 40 | 41 | print('--- block on first_name ---') 42 | bg = rltk.HashBlockGenerator() 43 | block = bg.generate(bg.block(ds1, property_='first_name'), 44 | bg.block(ds2, property_='first_name')) 45 | 46 | pairs = rltk.get_record_pairs(ds1, ds2, block=block) 47 | for r1, r2 in pairs: 48 | print(r1.id, r1.first_name, '\t', r2.id, r2.first_name) 49 | 50 | 51 | print('--- block on first_name[:1] ---') 52 | bg2 = rltk.HashBlockGenerator() 53 | block2 = bg2.generate( 54 | bg2.block(ds1, function_=lambda r: r.first_name[:1]), 55 | bg2.block(ds2, function_=lambda r: r.first_name[:1])) 56 | 57 | pairs = rltk.get_record_pairs(ds1, ds2, block=block2) 58 | for r1, r2 in pairs: 59 | print(r1.id, r1.first_name, '\t', r2.id, r2.first_name) 60 | 61 | 62 | print('--- block on first_name[:3] based on previous blocks ---') 63 | bg3 = rltk.HashBlockGenerator() 64 | block3 = bg3.generate( 65 | bg3.block(ds1, function_=lambda r: r.first_name[:3], base_on=block2), 66 | bg3.block(ds2, function_=lambda r: r.first_name[:3], base_on=block2)) 67 | pairs = rltk.get_record_pairs(ds1, ds2, block=block3) 68 | for r1, r2 in pairs: 69 | print(r1.id, r1.first_name, '\t', r2.id, r2.first_name) 70 | print('inside blocks:') 71 | for b, d, r in block3: 72 | print(b, d, r) 73 | -------------------------------------------------------------------------------- /examples/blocking/inverted_index.py: -------------------------------------------------------------------------------- 1 | import rltk 2 | 3 | 4 | @rltk.remove_raw_object 5 | class Record1(rltk.Record): 6 | 7 | @rltk.cached_property 8 | def id(self): 9 | return self.raw_object['doc_id'] 10 | 11 | @rltk.cached_property 12 | def first_name(self): 13 | return self.raw_object['first name'] 14 | 15 | @rltk.cached_property 16 | def last_name(self): 17 | return self.raw_object['last name'] 18 | 19 | @property 20 | def full_name(self): 21 | return self.first_name + ' ' + self.last_name 22 | 23 | 24 | @rltk.remove_raw_object 25 | class Record2(rltk.Record): 26 | 27 | @rltk.cached_property 28 | def id(self): 29 | return self.raw_object['ident'] 30 | 31 | @rltk.cached_property 32 | def first_name(self): 33 | return self.raw_object['name'].split(' ')[0] 34 | 35 | @rltk.cached_property 36 | def last_name(self): 37 | return self.raw_object['name'].split(' ')[1] 38 | 39 | @property 40 | def full_name(self): 41 | return self.first_name + ' ' + self.last_name 42 | 43 | ds1 = rltk.Dataset(reader=rltk.CSVReader('ds1.csv', delimiter=','), record_class=Record1) 44 | ds2 = rltk.Dataset(reader=rltk.JsonLinesReader('ds2.jl'), record_class=Record2) 45 | 46 | ngram = rltk.NGramTokenizer() 47 | 48 | bg = rltk.TokenBlockGenerator() 49 | block1 = bg.block(ds1, function_=lambda r: ngram.basic(r.first_name, 3), 50 | block=rltk.Block(rltk.LevelDbKeySetAdapter('block_store', 'b1', clean=True))) 51 | block2 = bg.block(ds2, function_=lambda r: ngram.basic(r.first_name, 3), 52 | block=rltk.Block(rltk.LevelDbKeySetAdapter('block_store', 'b2', clean=True))) 53 | block3 = bg.generate(block1, block2, rltk.Block(rltk.LevelDbKeySetAdapter('block_store', 'b3', clean=True))) 54 | pairs = rltk.get_record_pairs(ds1, ds2, block=block3) 55 | for r1, r2 in pairs: 56 | print(r1.id, r1.full_name, '\t', r2.id, 
r2.full_name) 57 | -------------------------------------------------------------------------------- /examples/evaluation/.gitignore: -------------------------------------------------------------------------------- 1 | saved_ground_truth.csv -------------------------------------------------------------------------------- /examples/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/usc-isi-i2/rltk/aee10ed5dd561583e60db3373ed82fe1208da1e9/examples/evaluation/__init__.py -------------------------------------------------------------------------------- /examples/evaluation/construct_datasets.py: -------------------------------------------------------------------------------- 1 | import rltk 2 | 3 | 4 | @rltk.remove_raw_object 5 | class EvaluationRecord(rltk.Record): 6 | @rltk.cached_property 7 | def id(self): 8 | return self.raw_object['id'] 9 | 10 | @rltk.cached_property 11 | def name(self): 12 | return self.raw_object['name'] 13 | 14 | @rltk.cached_property 15 | def laptop(self): 16 | return self.raw_object['laptop_brand'] 17 | 18 | 19 | @rltk.remove_raw_object 20 | class EvaluationRecord2(rltk.Record): 21 | @rltk.cached_property 22 | def id(self): 23 | return self.raw_object['id'] 24 | 25 | @rltk.cached_property 26 | def name(self): 27 | return self.raw_object['name'] 28 | 29 | @rltk.cached_property 30 | def laptop(self): 31 | return self.raw_object['laptop'] 32 | 33 | 34 | dataset_1_file_name = 'data_1.csv' 35 | dataset_2_file_name = 'data_2.csv' 36 | 37 | ds1 = rltk.Dataset(reader=rltk.CSVReader(dataset_1_file_name), 38 | record_class=EvaluationRecord) 39 | ds2 = rltk.Dataset(reader=rltk.CSVReader(dataset_2_file_name), 40 | record_class=EvaluationRecord2) 41 | -------------------------------------------------------------------------------- /examples/evaluation/data_1.csv: -------------------------------------------------------------------------------- 1 | id,name,laptop_brand 2 | 0,Jerry Li,Alienware 3 | 1,Jeremy Yin,Apple 4 | 2,Jack Liu,HP 5 | 3,John Xi,Apple 6 | -------------------------------------------------------------------------------- /examples/evaluation/data_2.csv: -------------------------------------------------------------------------------- 1 | id,name,laptop 2 | 10,jerry,Alienware 6 3 | 11,jeremy,Macbook Pro 4 | 12,jack,HP Envy X360 5 | 13,john,Macbook Pro -------------------------------------------------------------------------------- /examples/evaluation/generate_negative_gt.py: -------------------------------------------------------------------------------- 1 | from construct_datasets import * 2 | 3 | 4 | print('generate negatives') 5 | gt = rltk.GroundTruth() 6 | gt.load('gt_positive_only.csv') 7 | 8 | 9 | def score_function(r1, r2): 10 | return rltk.levenshtein_similarity(r1.name, r2.name) 11 | 12 | gt.generate_negatives(ds1, ds2, score_function=score_function) 13 | 14 | for id1, id2, label in gt: 15 | print(id1, id2, label) 16 | 17 | 18 | print('generate all negatives') 19 | gt1 = rltk.GroundTruth() 20 | gt1.load('gt_positive_only.csv') 21 | 22 | gt1.generate_all_negatives(ds1, ds2) 23 | for id1, id2, label in gt1: 24 | print(id1, id2, label) 25 | 26 | 27 | print('generate stratified negatives') 28 | gt2 = rltk.GroundTruth() 29 | gt2.load('gt_positive_only.csv') 30 | 31 | 32 | num_of_cluster = 3 33 | curr = -1 34 | 35 | 36 | def classify(r1, f2): 37 | global curr 38 | curr = (curr + 1) % num_of_cluster 39 | return curr 40 | 41 | gt2.generate_stratified_negatives(ds1, ds2, classify, 
num_of_cluster) 42 | for id1, id2, label in gt2: 43 | print(id1, id2, label) -------------------------------------------------------------------------------- /examples/evaluation/ground_truth.csv: -------------------------------------------------------------------------------- 1 | id1,id2,label 2 | 0,10,True 3 | 0,11,False 4 | 0,12,False 5 | 0,13,False 6 | 1,10,False 7 | 1,11,True 8 | 1,12,False 9 | 1,13,False 10 | 2,10,False 11 | 2,11,False 12 | 2,12,True 13 | 2,13,False 14 | 3,10,False 15 | 3,11,False 16 | 3,12,False 17 | -------------------------------------------------------------------------------- /examples/evaluation/gt_positive_only.csv: -------------------------------------------------------------------------------- 1 | id1,id2,label 2 | 0,11,True 3 | 1,13,True 4 | -------------------------------------------------------------------------------- /examples/evaluation/run_evaluation.py: -------------------------------------------------------------------------------- 1 | from construct_datasets import * 2 | 3 | saved_ground_truth_file_name = 'ground_truth.csv' 4 | gt = rltk.GroundTruth() 5 | gt.load(saved_ground_truth_file_name) 6 | 7 | gt.add_ground_truth('3', '13', True) 8 | gt.save('saved_' + saved_ground_truth_file_name) 9 | 10 | eva = rltk.Evaluation() 11 | 12 | for min_confidence_100 in range(0, 100): 13 | threshold = min_confidence_100 / 100 14 | trial = rltk.Trial(gt, min_confidence=0, top_k=0, 15 | label='min threshold is: {}'.format(threshold), threshold=threshold) 16 | pairs = rltk.get_record_pairs(ds1, ds2) 17 | for r1, r2 in pairs: 18 | c = 0.3 * rltk.levenshtein_similarity(r1.name, r2.name) + 0.7 * rltk.levenshtein_similarity(r1.laptop, r2.laptop) 19 | p = (c >= threshold) 20 | trial.add_result(r1, r2, p, c) 21 | 22 | trial.evaluate() 23 | eva.add_trial(trial) 24 | 25 | # coord = [ 26 | # { 27 | # 'x': 'threshold', 28 | # 'y': 'false_positives', 29 | # 'label': '123' 30 | # }, 31 | # { 32 | # 'x': 'threshold', 33 | # 'y': 'true_positives', 34 | # 'label': '456', 35 | # 'linestyle': '--' 36 | # }, 37 | # { 38 | # 'x': 'recall', 39 | # 'y': 'precision', 40 | # 'label': 'pr', 41 | # 'linestyle': '--' 42 | # } 43 | # ] 44 | # eva.plot(coord) 45 | eva.plot_precision_recall().show() 46 | 47 | eva.plot_roc().show() 48 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Cython>=0.28.0 2 | numpy>=1.17.0 3 | pandas>=1.2.0 4 | scipy>=1.1.0 5 | matplotlib>=2.0.0 6 | dask>=0.19.2 7 | distributed>=1.23 8 | pyrallel.lib 9 | -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | redis>=2.0.0 2 | happybase>=1.1.0 3 | plyvel>=1.0.5 4 | pytest 5 | pytest-cov<2.6 6 | 7 | -------------------------------------------------------------------------------- /requirements_docs.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | -r requirements_dev.txt 3 | alabaster>=0.7.9 4 | Sphinx>=1.5.6 5 | sphinx-autobuild>=0.6.0 6 | sphinxcontrib-napoleon>=0.6.0 7 | nbsphinx>=0.3.4 8 | pandoc>=1.0.2 9 | tornado==4.5.3 10 | jupyter>=1.0.0 11 | -------------------------------------------------------------------------------- /rltk/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '2.0.0-a020' 2 | 3 | from rltk.record import 
Record, AutoGeneratedRecord,\ 4 | cached_property, generate_record_property_cache, validate_record, remove_raw_object, set_id 5 | from rltk.dataset import Dataset 6 | from rltk.io import * 7 | from rltk.similarity import * 8 | from rltk.blocking import * 9 | from rltk.tokenizer import * 10 | from rltk.evaluation import * 11 | from rltk.utils import candidate_pairs, get_record_pairs 12 | import rltk.cli 13 | import rltk.remote 14 | -------------------------------------------------------------------------------- /rltk/__main__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import tempfile 4 | import logging 5 | 6 | from distributed.cli import dask_scheduler, dask_worker 7 | 8 | 9 | def help_info(): 10 | print('Available commands:') 11 | print('remote.worker, remote.scheduler') 12 | 13 | 14 | if __name__ == '__main__': 15 | if len(sys.argv) <= 1: 16 | print('No command\n') 17 | help_info() 18 | sys.exit() 19 | 20 | cmd = sys.argv[1] 21 | sub_cmd = sys.argv[2:] if len(sys.argv) >= 3 else [] 22 | sys.argv.pop(1) 23 | 24 | if cmd in ('help', '--help', 'h', '-h'): 25 | help_info() 26 | sys.exit() 27 | 28 | sys.argv[0] = cmd # replace prog name 29 | temp_path = os.path.join(tempfile.gettempdir(), 'rltk', 'remote') 30 | if not os.path.exists(temp_path): 31 | os.makedirs(temp_path, exist_ok=True) 32 | if cmd == 'remote.worker': 33 | logger = logging.getLogger('distributed.dask_worker') 34 | logger.setLevel(logging.ERROR) 35 | sys.argv.append('--local-directory') 36 | sys.argv.append(temp_path) 37 | # sys.argv.append('--change-directory') 38 | sys.exit(dask_worker.go()) 39 | elif cmd == 'remote.scheduler': 40 | logger = logging.getLogger('distributed.scheduler') 41 | logger.setLevel(logging.ERROR) 42 | sys.argv.append('--local-directory') 43 | sys.argv.append(temp_path) 44 | sys.exit(dask_scheduler.go()) 45 | else: 46 | print('Unknown command\n') 47 | help_info() 48 | 49 | sys.exit() 50 | -------------------------------------------------------------------------------- /rltk/blocking/__init__.py: -------------------------------------------------------------------------------- 1 | from rltk.blocking.block import Block 2 | from rltk.blocking.block_black_list import BlockBlackList 3 | from rltk.blocking.block_generator import BlockGenerator 4 | from rltk.blocking.hash_block_generator import HashBlockGenerator 5 | from rltk.blocking.token_block_generator import TokenBlockGenerator 6 | from rltk.blocking.canopy_block_generator import CanopyBlockGenerator 7 | from rltk.blocking.sorted_neighbourhood_block_generator import SortedNeighbourhoodBlockGenerator 8 | from rltk.blocking.blocking_helper import BlockingHelper 9 | 10 | Blocker = BlockGenerator 11 | HashBlocker = HashBlockGenerator 12 | TokenBlocker = TokenBlockGenerator 13 | CanopyBlocker = CanopyBlockGenerator 14 | SortedNeighbourhoodBlocker = SortedNeighbourhoodBlockGenerator 15 | -------------------------------------------------------------------------------- /rltk/blocking/block.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | 3 | from rltk.io.adapter.key_set_adapter import KeySetAdapter 4 | from rltk.io.adapter.memory_key_set_adapter import MemoryKeySetAdapter 5 | from rltk.dataset import Dataset 6 | from rltk.record import Record 7 | 8 | 9 | class Block(object): 10 | """ 11 | Block 12 | 13 | key_set_adapter (keySetAdapter, optional): Where the block stores. If it's None, 14 | :meth:`MemoryKeySetAdapter` is used. 
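# A minimal sketch (hypothetical ids) of the constructor documented above: a Block is a thin
# wrapper over a KeySetAdapter, so the in-memory default or any persistent adapter can back it.
import rltk

block = rltk.Block(key_set_adapter=rltk.MemoryKeySetAdapter())  # equivalent to rltk.Block()
block.add('block-a', 'ds1', 'r1')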
Defaults to None. 15 | """ 16 | def __init__(self, key_set_adapter: KeySetAdapter = None): 17 | if not key_set_adapter: 18 | key_set_adapter = MemoryKeySetAdapter() 19 | self.key_set_adapter = key_set_adapter 20 | 21 | def add(self, block_id, dataset_id, record_id): 22 | """ 23 | Add to block. 24 | 25 | Args: 26 | block_id (str): Block id. 27 | dataset_id (str / Dataset): Dataset id or Dataset object. 28 | record_id (str / Record): Record id or Record object. 29 | """ 30 | if isinstance(dataset_id, Dataset): 31 | dataset_id = dataset_id.id 32 | if isinstance(record_id, Record): 33 | record_id = record_id.id 34 | self.key_set_adapter.add(block_id, (dataset_id, record_id)) 35 | 36 | def get(self, block_id): 37 | """ 38 | Get block by block_id. 39 | 40 | Args: 41 | block_id (str): Block id. 42 | 43 | Returns: 44 | set: {(dataset_id, record_id)} 45 | """ 46 | return self.key_set_adapter.get(block_id) 47 | 48 | def __iter__(self): 49 | """ 50 | Same as :meth:`__next__` 51 | """ 52 | return self.__next__() 53 | 54 | def __next__(self): 55 | """ 56 | Iterator of blocks. 57 | 58 | Returns: 59 | iter: block_id, dataset_id, record_id. 60 | """ 61 | for block_id, data in self.key_set_adapter: 62 | for dataset_id, record_id in data: 63 | yield block_id, dataset_id, record_id 64 | 65 | def pairwise(self, ds_id1: str, ds_id2: str = None): 66 | """ 67 | Iterator of id pairs generated according to blocks. 68 | 69 | Returns: 70 | iter: block_id, id1, id2. 71 | """ 72 | if isinstance(ds_id1, Dataset): 73 | ds_id1 = ds_id1.id 74 | if ds_id2 and isinstance(ds_id2, Dataset): 75 | ds_id2 = ds_id2.id 76 | 77 | if ds_id2: 78 | for block_id, data in self.key_set_adapter: 79 | # fetch one block 80 | ds1, ds2 = list(), list() 81 | for dataset_id, record_id in data: 82 | if dataset_id == ds_id1: 83 | ds1.append(record_id) 84 | elif dataset_id == ds_id2: 85 | ds2.append(record_id) 86 | 87 | # cross product 88 | for id1, id2 in itertools.product(ds1, ds2): 89 | yield block_id, id1, id2 90 | else: 91 | for block_id, data in self.key_set_adapter: 92 | # fetch one block 93 | ds1 = list() 94 | for dataset_id, record_id in data: 95 | if dataset_id == ds_id1: 96 | ds1.append(record_id) 97 | 98 | # combinations of two elements 99 | for ds1, ds1_ in itertools.combinations(ds1, 2): 100 | yield block_id, ds1, ds1_ 101 | -------------------------------------------------------------------------------- /rltk/blocking/block_black_list.py: -------------------------------------------------------------------------------- 1 | from rltk.io.adapter.key_set_adapter import KeySetAdapter 2 | from rltk.io.adapter.memory_key_set_adapter import MemoryKeySetAdapter 3 | from rltk.blocking.block import Block 4 | 5 | 6 | class BlockBlackList(object): 7 | """ 8 | Block black list 9 | 10 | Args: 11 | key_set_adapter (keySetAdapter, optional): Where the block stores. If it's None, 12 | :meth:`MemoryKeySetAdapter` is used. Defaults to None. 13 | max_size (int, optional): Maximum size of a block. Used by :meth:`add`. Defaults to 0. 14 | """ 15 | def __init__(self, key_set_adapter: KeySetAdapter = None, max_size: int = 0): 16 | if not key_set_adapter: 17 | key_set_adapter = MemoryKeySetAdapter() 18 | self.key_set_adapter = key_set_adapter 19 | self._max_size = max_size 20 | 21 | def has(self, block_id: str): 22 | """ 23 | Test if block_id is in black list. 24 | 25 | Args: 26 | block_id (str): Block id. 
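# A minimal sketch of the Block API documented above (hypothetical ids): `add` accepts plain
# ids or Dataset / Record objects, `get` returns the stored set of (dataset_id, record_id)
# tuples, and single-dataset `pairwise` yields within-block combinations for deduplication.
import rltk

b = rltk.Block()
b.add('k1', 'ds1', 'r1')
b.add('k1', 'ds1', 'r2')
b.add('k1', 'ds1', 'r3')
print(b.get('k1'))  # e.g. {('ds1', 'r1'), ('ds1', 'r2'), ('ds1', 'r3')}
for block_id, id1, id2 in b.pairwise('ds1'):  # 3 records -> 3 combinations
    print(block_id, id1, id2)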
27 | """ 28 | return self.key_set_adapter.get(block_id) is not None 29 | 30 | def add(self, block_id: str, block: Block): 31 | """ 32 | Add block_id to black list and update block data. 33 | 34 | Args: 35 | block_id (str): Block id. 36 | block (Block): Block object. 37 | 38 | Notes: 39 | * If `max_size` is 0, then block_id will be added. 40 | * If `max_size` is greater than 0 and data in this block is more than this size, 41 | this block_id will be added to BlockBlackList and this block is removed from Block. 42 | """ 43 | if self._max_size > 0: 44 | d = block.key_set_adapter.get(block_id) 45 | if len(d) > self._max_size: 46 | self.key_set_adapter.set(block_id, set()) 47 | block.key_set_adapter.delete(block_id) 48 | else: 49 | self.key_set_adapter.set(block_id, set()) 50 | 51 | def __contains__(self, item): 52 | """ 53 | Same as :meth:`has` 54 | """ 55 | self.has(item) 56 | -------------------------------------------------------------------------------- /rltk/blocking/block_generator.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, TYPE_CHECKING 2 | 3 | if TYPE_CHECKING: 4 | from rltk.dataset import Dataset 5 | from rltk.blocking.block import Block 6 | from rltk.blocking.block_black_list import BlockBlackList 7 | 8 | 9 | class BlockGenerator(object): 10 | """ 11 | Block generator. 12 | """ 13 | 14 | def block(self, dataset: 'Dataset', function_: Callable = None, property_: str = None, 15 | block: Block = None, block_black_list: BlockBlackList = None, base_on: Block = None): 16 | """ 17 | Block on property or by function for dataset. 18 | 19 | Args: 20 | dataset (Dataset): Dataset. 21 | function_ (Callable): `function_(r: record)`. 22 | property_ (str): The property in Record object. 23 | block (Block): Where to write blocks. If None, a new block will be created. Defaults to None. 24 | block_black_list (BlockBlackList, optional): Where all blacklisted blocks are stored. Defaults to None. 25 | base_on (Block, optional): Current block is generated base on this block. Defaults to None. 26 | 27 | Returns: 28 | Block: 29 | """ 30 | block = BlockGenerator._block_args_check(function_, property_, block) 31 | return block 32 | 33 | @staticmethod 34 | def _block_args_check(function_, property_, block): 35 | if not function_ and not property_: 36 | raise ValueError('Invalid function or property') 37 | return block or Block() 38 | 39 | def generate(self, block1: Block, block2: Block, output_block: Block = None): 40 | """ 41 | Generate block from two blocks of single dataset. 42 | 43 | Args: 44 | block1 (Block): Block 1. 45 | block2 (Block): Block 2. 46 | output_block (Block): Where the output block goes. If None, a new block will be created. Defaults to None. 47 | 48 | Returns: 49 | Block: 50 | """ 51 | block = BlockGenerator._generate_args_check(output_block) 52 | return block 53 | 54 | @staticmethod 55 | def _generate_args_check(block): 56 | return block or Block() 57 | -------------------------------------------------------------------------------- /rltk/blocking/blocking_helper.py: -------------------------------------------------------------------------------- 1 | import json 2 | import hashlib 3 | import operator 4 | 5 | from rltk.blocking.block import Block 6 | from rltk.io.adapter.key_set_adapter import KeySetAdapter 7 | from rltk.io.adapter.memory_key_set_adapter import MemoryKeySetAdapter 8 | 9 | 10 | class BlockingHelper(object): 11 | """ 12 | Blocking Helper. 
13 | """ 14 | 15 | @staticmethod 16 | def encode_inverted_index_key(dataset_id, record_id): 17 | return json.dumps({'d': dataset_id, 'r': record_id}, sort_keys=True) 18 | 19 | @staticmethod 20 | def decode_inverted_index_key(key): 21 | key = json.loads(key) 22 | return key['d'], key['r'] 23 | 24 | @staticmethod 25 | def generate_inverted_indices(block: Block, ks_adapter: KeySetAdapter = None): 26 | """ 27 | Generate inverted indices of block. 28 | 29 | Args: 30 | block (Block): Original block. 31 | ks_adapter (KeySetAdapter): Where the inverted indices store. 32 | 33 | Returns: 34 | KeySetAdapter: 35 | """ 36 | if not ks_adapter: 37 | ks_adapter = MemoryKeySetAdapter() 38 | for block_id, dataset_id, record_id in block: 39 | ks_adapter.add(BlockingHelper.encode_inverted_index_key(dataset_id, record_id), block_id) 40 | return ks_adapter 41 | 42 | @staticmethod 43 | def _block_operations(operator_, left_block, right_block, right_inverted, output_block): 44 | operation = None 45 | if operator_ == 'union': 46 | operation = operator.or_ # lambda a, b: a | b 47 | elif operator_ == 'intersect': 48 | operation = operator.and_ # lambda a, b: a & b 49 | 50 | for left_block_id, left_data in left_block.key_set_adapter: 51 | for left_dataset_id, left_record_id in left_data: 52 | key = BlockingHelper.encode_inverted_index_key(left_dataset_id, left_record_id) 53 | right_block_ids = right_inverted.get(key) 54 | if right_block_ids: 55 | for right_block_id in right_block_ids: 56 | new_block_data = operation(left_data, right_block.get(right_block_id)) 57 | new_block_id = hashlib \ 58 | .sha1(''.join(sorted(['{},{}'.format(ds, r) for ds, r in new_block_data])) 59 | .encode('utf-8')).hexdigest() 60 | output_block.key_set_adapter.set(new_block_id, new_block_data) 61 | 62 | @staticmethod 63 | def union(block1, inverted1, block2, inverted2, block3=None): 64 | """ 65 | Union of two blocks. 66 | 67 | Args: 68 | block1 (Block): Block 1. 69 | inverted1 (KeySetAdapter): Inverted indices of block 1. 70 | block2 (Block): Block2. 71 | inverted2 (KeySetAdapter): Inverted indices of block 2. 72 | block3 (Block, optional): Unioned block. If None, a Block object will be created. Defaults to None. 73 | 74 | Returns: 75 | Block: 76 | """ 77 | block3 = block3 or Block() 78 | 79 | BlockingHelper._block_operations('union', block1, block2, inverted2, block3) 80 | BlockingHelper._block_operations('union', block2, block1, inverted1, block3) 81 | return block3 82 | 83 | @staticmethod 84 | def intersect(block1, inverted1, block2, inverted2, block3=None): 85 | """ 86 | Intersection of two blocks. 87 | 88 | Args: 89 | block1 (Block): Block 1. 90 | inverted1 (KeySetAdapter): Inverted indices of block 1. 91 | block2 (Block): Block2. 92 | inverted2 (KeySetAdapter): Inverted indices of block 2. 93 | block3 (Block, optional): Intersected block. If None, a Block object will be created. Defaults to None. 
94 | 95 | Returns: 96 | Block: 97 | """ 98 | block3 = block3 or Block() 99 | 100 | BlockingHelper._block_operations('intersect', block1, block2, inverted2, block3) 101 | BlockingHelper._block_operations('intersect', block2, block1, inverted1, block3) 102 | return block3 103 | -------------------------------------------------------------------------------- /rltk/blocking/canopy_block_generator.py: -------------------------------------------------------------------------------- 1 | import json 2 | import random 3 | from typing import Callable 4 | 5 | from rltk.blocking.block_generator import BlockGenerator 6 | from rltk.blocking.block import Block 7 | from rltk.blocking.block_black_list import BlockBlackList 8 | 9 | 10 | class CanopyBlockGenerator(BlockGenerator): 11 | """ 12 | Canopy based block generator. 13 | 14 | Args: 15 | t1 (float): The loose distance. 16 | t2 (float): The tight distance. 17 | distance_metric (Callable): Compute the distance between two vectors returned from :meth:`block`. 18 | The signature is `distance(v1: List, v2: List) -> float` 19 | """ 20 | def __init__(self, t1, t2, distance_metric): 21 | if t1 <= t2: 22 | raise ValueError('t1 should be greater than t2') 23 | if t2 <= 0: 24 | raise ValueError('t1 and t2 should be greater than 0') 25 | 26 | self._t1 = t1 27 | self._t2 = t2 28 | self._distance_metric = distance_metric 29 | 30 | def block(self, dataset, function_: Callable = None, property_: str = None, 31 | block: Block = None, block_black_list: BlockBlackList = None, base_on: Block = None): 32 | """ 33 | The return of `property_` or `function_` should be a vector (list). 34 | """ 35 | block = super()._block_args_check(function_, property_, block) 36 | 37 | if base_on: 38 | raise Exception('Canopy currently doesn\'t support `base_on`') 39 | # for block_id, dataset_id, record_id in base_on: 40 | # if dataset.id == dataset_id: 41 | # r = dataset.get_record(record_id) 42 | # value = function_(r) if function_ else getattr(r, property_) 43 | # if not isinstance(value, list): 44 | # raise ValueError('Return of the function or property should be a vector (list)') 45 | # value = block_id + '-' + value 46 | # k = self._encode_key(value) 47 | # if block_black_list and block_black_list.has(k): 48 | # continue 49 | # block.add(k, dataset.id, r.id) 50 | # if block_black_list: 51 | # block_black_list.add(k, block) 52 | 53 | else: 54 | for r in dataset: 55 | value = function_(r) if function_ else getattr(r, property_) 56 | if not isinstance(value, list): 57 | raise ValueError('Return of the function or property should be a vector (list)') 58 | k = self._encode_key(value) 59 | if block_black_list and block_black_list.has(k): 60 | continue 61 | block.add(k, dataset.id, r.id) 62 | if block_black_list: 63 | block_black_list.add(k, block) 64 | 65 | return block 66 | 67 | @staticmethod 68 | def _encode_key(obj): 69 | return str(obj) 70 | 71 | @staticmethod 72 | def _decode_key(str_): 73 | return eval(str_) 74 | 75 | def generate(self, block1: Block, block2: Block, output_block: Block = None): 76 | output_block = BlockGenerator._generate_args_check(output_block) 77 | dataset = [] 78 | for key, _ in block1.key_set_adapter: 79 | dataset.append(self._decode_key(key)) 80 | for key, _ in block2.key_set_adapter: 81 | dataset.append(self._decode_key(key)) 82 | 83 | clusters = self._run_canopy_clustering(dataset, self._t1, self._t2, self._distance_metric) 84 | 85 | for cid, c in enumerate(clusters): 86 | for vec in c: 87 | key = self._encode_key(vec) 88 | set_ = block1.get(key) 89 | if set_: 90 |
for ds_id, rid in set_: 91 | output_block.add(cid, ds_id, rid) 92 | set_ = block2.get(key) 93 | if set_: 94 | for ds_id, rid in set_: 95 | output_block.add(cid, ds_id, rid) 96 | return output_block 97 | 98 | @staticmethod 99 | def _run_canopy_clustering(dataset, t1, t2, distance_metric): 100 | """ 101 | The algorithm proceeds as follows, using two thresholds t1 (the loose distance) and t2 (the tight distance), 102 | where t1 > t2. 103 | 104 | 1. Begin with the set of data points to be clustered. 105 | 2. Remove a point from the set, beginning a new 'canopy' containing this point. 106 | 3. For each point left in the set, assign it to the new canopy \ 107 | if its distance to the first point of the canopy is less than the loose distance t1. 108 | 4. If the distance of the point is additionally less than the tight distance t2, 109 | remove it from the original set. 110 | 5. Repeat from step 2 until there are no more data points in the set to cluster. 111 | """ 112 | canopies = [] 113 | while len(dataset) > 0: 114 | center_idx = random.randint(0, len(dataset) - 1) 115 | center_vec = dataset[center_idx] 116 | new_canopy = [] 117 | delete_list = [] 118 | del dataset[center_idx] 119 | 120 | for d_idx in range(len(dataset)): 121 | d = dataset[d_idx] 122 | distance = distance_metric(center_vec, d) 123 | if distance < t1: 124 | new_canopy.append(d) 125 | if distance < t2: 126 | delete_list.append(d_idx) 127 | 128 | # delete vector from dataset from backward 129 | for d_idx in sorted(delete_list, reverse=True): 130 | del dataset[d_idx] 131 | new_canopy.append(center_vec) # add center 132 | canopies.append(new_canopy) 133 | return canopies 134 | -------------------------------------------------------------------------------- /rltk/blocking/hash_block_generator.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, TYPE_CHECKING 2 | 3 | if TYPE_CHECKING: 4 | from rltk.dataset import Dataset 5 | from rltk.blocking.block_generator import BlockGenerator 6 | from rltk.blocking.block import Block 7 | from rltk.blocking.block_black_list import BlockBlackList 8 | 9 | 10 | class HashBlockGenerator(BlockGenerator): 11 | """ 12 | Hash block generator. 13 | """ 14 | 15 | def block(self, dataset, function_: Callable = None, property_: str = None, 16 | block: Block = None, block_black_list: BlockBlackList = None, base_on: Block = None): 17 | """ 18 | The return of `property_` or `function_` should be string. 
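# A minimal sketch combining HashBlockGenerator with a BlockBlackList (assuming ds1 / ds2 are
# Datasets like those in the examples above): any hash value shared by more than max_size
# records is blacklisted, which keeps the per-block cross product small.
import rltk

bg = rltk.HashBlockGenerator()
black_list = rltk.BlockBlackList(max_size=100)
block = bg.generate(
    bg.block(ds1, property_='first_name', block_black_list=black_list),
    bg.block(ds2, property_='first_name', block_black_list=black_list))
for r1, r2 in rltk.get_record_pairs(ds1, ds2, block=block):
    print(r1.id, r2.id)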
19 | """ 20 | block = super()._block_args_check(function_, property_, block) 21 | 22 | if base_on: 23 | for block_id, dataset_id, record_id in base_on: 24 | if dataset.id == dataset_id: 25 | r = dataset.get_record(record_id) 26 | value = function_(r) if function_ else getattr(r, property_) 27 | if not isinstance(value, str): 28 | raise ValueError('Return of the function or property should be a string') 29 | value = block_id + '-' + value 30 | if block_black_list and block_black_list.has(value): 31 | continue 32 | block.add(value, dataset.id, r.id) 33 | if block_black_list: 34 | block_black_list.add(value, block) 35 | 36 | else: 37 | for r in dataset: 38 | value = function_(r) if function_ else getattr(r, property_) 39 | if not isinstance(value, str): 40 | raise ValueError('Return of the function or property should be a string') 41 | if block_black_list and block_black_list.has(value): 42 | continue 43 | block.add(value, dataset.id, r.id) 44 | if block_black_list: 45 | block_black_list.add(value, block) 46 | 47 | return block 48 | 49 | def generate(self, block1: Block, block2: Block, output_block: Block = None): 50 | output_block = super()._generate_args_check(output_block) 51 | for block_id, ds_id, record_id in block1: 52 | output_block.add(block_id, ds_id, record_id) 53 | for block_id, ds_id, record_id in block2: 54 | output_block.add(block_id, ds_id, record_id) 55 | return output_block 56 | -------------------------------------------------------------------------------- /rltk/blocking/sorted_neighbourhood_block_generator.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | from functools import cmp_to_key 3 | 4 | from rltk.blocking.block_generator import BlockGenerator 5 | from rltk.blocking.block import Block 6 | from rltk.blocking.block_black_list import BlockBlackList 7 | 8 | 9 | class SortedNeighbourhoodBlockGenerator(BlockGenerator): 10 | """ 11 | Sorted Neighbourhood Blocker. 12 | 13 | Args: 14 | window_size (int, optional): Window size. Defaults to 3. 15 | comparator (Callable, optional): Define how to compare two tokens t1 and t2. 16 | The signature is `comparator(t1: str, t2: str) -> int`. 17 | If return is 0, t1 equals t2; if return is -1, t1 is less than t2; 18 | if return is 1, t1 is greater than t2. 19 | Defaults to None, which uses Python's default string comparison. 20 | block_id_prefix (str, optional): The block id prefix of each block. 21 | Defaults to "sorted_neighbourhood_". 22 | """ 23 | def __init__(self, window_size: int = 3, comparator: Callable = None, block_id_prefix='sorted_neighbourhood_'): 24 | if comparator is None: 25 | comparator = self._default_comparator 26 | self.window_size = window_size 27 | self.comparator = comparator 28 | self.block_id_prefix = block_id_prefix 29 | 30 | def block(self, dataset, function_: Callable = None, property_: str = None, 31 | block: Block = None, block_black_list: BlockBlackList = None, base_on: Block = None): 32 | """ 33 | The return of `property_` or `function_` should be a vector (list). 
34 | """ 35 | block = super()._block_args_check(function_, property_, block) 36 | 37 | if base_on: 38 | for block_id, dataset_id, record_id in base_on: 39 | if dataset.id == dataset_id: 40 | r = dataset.get_record(record_id) 41 | value = function_(r) if function_ else getattr(r, property_) 42 | if not isinstance(value, (list, set)): 43 | value = value(set) 44 | for v in value: 45 | if not isinstance(v, str): 46 | raise ValueError('Elements in return list should be string') 47 | if block_black_list and block_black_list.has(v): 48 | continue 49 | v = block_id + '-' + v 50 | block.add(v, dataset.id, r.id) 51 | if block_black_list: 52 | block_black_list.add(v, block) 53 | 54 | else: 55 | for r in dataset: 56 | value = function_(r) if function_ else getattr(r, property_) 57 | if not isinstance(value, (list, set)): 58 | value = set(value) 59 | for v in value: 60 | if not isinstance(v, str): 61 | raise ValueError('Elements in return list should be string') 62 | if block_black_list and block_black_list.has(v): 63 | continue 64 | block.add(v, dataset.id, r.id) 65 | if block_black_list: 66 | block_black_list.add(v, block) 67 | 68 | return block 69 | 70 | def generate(self, block1: Block, block2: Block, output_block: Block = None): 71 | output_block = BlockGenerator._generate_args_check(output_block) 72 | 73 | # TODO: in-memory operations here, need to update 74 | # concatenation 75 | all_records = [] 76 | for block_id, ds_id, record_id in block1: 77 | all_records.append((block_id, ds_id, record_id)) 78 | for block_id, ds_id, record_id in block2: 79 | all_records.append((block_id, ds_id, record_id)) 80 | sorted_all_records = sorted(all_records, key=cmp_to_key(self._comparator_wrapper)) 81 | 82 | # apply slide window 83 | for i in range(len(sorted_all_records) - self.window_size + 1): 84 | block_id = self.block_id_prefix + str(i) 85 | for j in range(self.window_size): 86 | record = sorted_all_records[i + j] 87 | output_block.add(block_id, record[1], record[2]) 88 | 89 | return output_block 90 | 91 | def _comparator_wrapper(self, t1, t2): 92 | return self.comparator(t1[0], t2[0]) 93 | 94 | @staticmethod 95 | def _default_comparator(t1, t2): 96 | return 0 if t1 == t2 else (1 if t1 > t2 else -1) 97 | -------------------------------------------------------------------------------- /rltk/blocking/token_block_generator.py: -------------------------------------------------------------------------------- 1 | from typing import Callable, TYPE_CHECKING 2 | 3 | if TYPE_CHECKING: 4 | from rltk.dataset import Dataset 5 | from rltk.blocking.block_generator import BlockGenerator 6 | from rltk.blocking.block import Block 7 | from rltk.blocking.block_black_list import BlockBlackList 8 | 9 | 10 | class TokenBlockGenerator(BlockGenerator): 11 | """ 12 | Token block generator. The return for :meth:`block` should be a `list` or `set`. 13 | """ 14 | 15 | def block(self, dataset, function_: Callable = None, property_: str = None, 16 | block: Block = None, block_black_list: BlockBlackList = None, base_on: Block = None): 17 | """ 18 | The return of `property_` or `function_` should be list or set. 
19 | """ 20 | block = super()._block_args_check(function_, property_, block) 21 | 22 | if base_on: 23 | for block_id, dataset_id, record_id in base_on: 24 | if dataset.id == dataset_id: 25 | r = dataset.get_record(record_id) 26 | value = function_(r) if function_ else getattr(r, property_) 27 | if not isinstance(value, list) and not isinstance(value, set): 28 | raise ValueError('Return of the function or property should be a list') 29 | for v in value: 30 | if not isinstance(v, str): 31 | raise ValueError('Elements in return list should be string') 32 | if block_black_list and block_black_list.has(v): 33 | continue 34 | v = block_id + '-' + v 35 | block.add(v, dataset.id, r.id) 36 | if block_black_list: 37 | block_black_list.add(v, block) 38 | 39 | else: 40 | for r in dataset: 41 | value = function_(r) if function_ else getattr(r, property_) 42 | if not isinstance(value, list) and not isinstance(value, set): 43 | raise ValueError('Return of the function or property should be a list') 44 | for v in value: 45 | if not isinstance(v, str): 46 | raise ValueError('Elements in return list should be string') 47 | if block_black_list and block_black_list.has(v): 48 | continue 49 | block.add(v, dataset.id, r.id) 50 | if block_black_list: 51 | block_black_list.add(v, block) 52 | 53 | return block 54 | 55 | def generate(self, block1: Block, block2: Block, output_block: Block = None): 56 | output_block = super()._generate_args_check(output_block) 57 | for block_id, ds_id, record_id in block1: 58 | output_block.add(block_id, ds_id, record_id) 59 | for block_id, ds_id, record_id in block2: 60 | output_block.add(block_id, ds_id, record_id) 61 | return output_block 62 | -------------------------------------------------------------------------------- /rltk/cli.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | #: Accept all default values without asking user 5 | SLIENTLY_ACCEPT_ALL_DEFAULT_VALUES = False 6 | 7 | 8 | def prompt(text: str, *args, new_line: bool = True, **kwargs): 9 | """ 10 | Prompt in terminal (stdout). 11 | 12 | Args: 13 | text (str): Text. 14 | *args: More text. 15 | new_line (bool, optional): End with a new line. Defaults to True. 16 | **kwargs: Other key word arguments used by :py:meth:`print`. 17 | """ 18 | line_end = '\n' if new_line else '' 19 | print(text, *args, file=sys.stdout, end=line_end, **kwargs) 20 | sys.stdout.flush() 21 | 22 | 23 | def select(text: str, cases: list, default: int = None, case_sensitive: bool = False): 24 | """ 25 | Let user select one of the cases. 26 | 27 | Args: 28 | text (str): Text. 29 | cases (list[tuple]): Cases, should be list of tuples. Each tuple is in form `('display text', 'user's input')`. 30 | For example, `[('(Y)es, 'y'), ('(N)o', 'n')]`. 31 | default (int, optional): Default case index in `cases`. Empty or space is treated as default case. 32 | None means no default case. Defaults to None. 33 | case_sensitive (bool, optional): If user's input is case sensitive, defaults to False. 34 | 35 | Returns: 36 | str: User's input. 
37 | """ 38 | prompt(text) 39 | case_text = [] 40 | for idx, c in enumerate(cases): 41 | if default is not None and idx == default: 42 | case_text.append('[{}]'.format(c[0])) 43 | else: 44 | case_text.append('{}'.format(c[0])) 45 | prompt(' / '.join(case_text)) 46 | valid_cases = [c[1] for c in cases] 47 | if default is not None: 48 | valid_cases.append('') 49 | if not case_sensitive: 50 | valid_cases = list(map(lambda x: x.lower(), valid_cases)) 51 | 52 | while True: 53 | user_input = '' 54 | if not SLIENTLY_ACCEPT_ALL_DEFAULT_VALUES or default is None: 55 | user_input = input().strip() 56 | if not case_sensitive: 57 | user_input = user_input.lower() 58 | if user_input not in valid_cases: 59 | prompt('Invalid input, please retry') 60 | continue 61 | 62 | if user_input == '' and default is not None: 63 | return cases[default][1] 64 | return user_input 65 | 66 | 67 | def confirm(text: str, default: bool = None): 68 | """ 69 | Let user choose Yes or No. 70 | 71 | Args: 72 | text (str): Text. 73 | default (bool, optional): True sets Yes as default case, False sets No. None means no default case. 74 | Defaults to None. 75 | 76 | Returns: 77 | bool: True means Yes, False means No. 78 | """ 79 | if default is not None: 80 | default = 0 if default else 1 81 | return select(text, cases=[('(Y)es', 'y',), ('(N)o', 'n')], default=default, case_sensitive=False) == 'y' 82 | 83 | 84 | class Progress(object): 85 | """ 86 | Progress status. 87 | 88 | Args: 89 | format_ (str, optional): Format of text. 90 | start (str, optional): Text while starting. 91 | end (str, optional): Text while ending. 92 | 93 | Note: 94 | 95 | Please use in `with` statement:: 96 | 97 | with rltk.cli.progress(format_='{}%') as p: 98 | for i in range(11): 99 | time.sleep(0.5) 100 | p.update(i * 10) 101 | 102 | """ 103 | 104 | def __init__(self, format_: str = '{}', start: str = 'Starting...', end: str = 'Done!'): 105 | self._format = format_ 106 | self._prev_len = 0 107 | self._start = start 108 | self._end = end 109 | 110 | def update(self, *args): 111 | """ 112 | Update progress. 113 | 114 | Args: 115 | *args: Arguments which will be formatted by `format_`. 116 | """ 117 | text = self._format.format(*args) 118 | 119 | # clean up 120 | prompt('\r' + ' ' * self._prev_len, new_line=False) 121 | 122 | # overwrite 123 | prompt('\r' + text, new_line=False) 124 | self._prev_len = len(text) 125 | 126 | def __enter__(self): 127 | """ 128 | Start prompt. 129 | """ 130 | if self._start: 131 | prompt(self._start, new_line=False) 132 | return self 133 | 134 | def __exit__(self, exc_type, exc_val, exc_tb): 135 | """ 136 | End prompt. 137 | """ 138 | # clean up 139 | prompt('\r' + ' ' * self._prev_len, new_line=False) 140 | 141 | if self._end: 142 | prompt('\r' + self._end, new_line=False) 143 | 144 | # new line 145 | prompt('') 146 | 147 | 148 | progress = Progress 149 | 150 | 151 | def input_(text: str, default: str = None, type_: type = str): 152 | """ 153 | Input. 154 | 155 | Args: 156 | text (str): Text. 157 | default (str, optional): Default value. Defaults to None which means no default value. 158 | type_ (type, optional): Type of input value, defaults to `str`. 159 | 160 | Returns: 161 | object: User input in type `type_`. 162 | 163 | Note: 164 | Make sure default value can be converted by `type_`, otherwise exception will be raised. 
165 | """ 166 | prompt(text) 167 | 168 | while True: 169 | if not SLIENTLY_ACCEPT_ALL_DEFAULT_VALUES or default is None: 170 | user_input = input().strip() 171 | try: 172 | return type_(user_input) 173 | except: 174 | prompt('Invalid input, please retry') 175 | else: 176 | return type_(default) 177 | 178 | -------------------------------------------------------------------------------- /rltk/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from rltk.evaluation.evaluation import Evaluation 2 | from rltk.evaluation.ground_truth import GroundTruth 3 | from rltk.evaluation.trial import Trial 4 | -------------------------------------------------------------------------------- /rltk/io/__init__.py: -------------------------------------------------------------------------------- 1 | from rltk.io.reader import * 2 | from rltk.io.writer import * 3 | from rltk.io.adapter import * 4 | -------------------------------------------------------------------------------- /rltk/io/adapter/__init__.py: -------------------------------------------------------------------------------- 1 | from rltk.io.adapter.key_value_adapter import KeyValueAdapter 2 | from rltk.io.adapter.memory_key_value_adapter import MemoryKeyValueAdapter 3 | from rltk.io.adapter.dbm_key_value_adapter import DbmKeyValueAdapter 4 | from rltk.io.adapter.redis_key_value_adapter import RedisKeyValueAdapter 5 | from rltk.io.adapter.hbase_key_value_adapter import HBaseKeyValueAdapter 6 | 7 | from rltk.io.adapter.key_set_adapter import KeySetAdapter 8 | from rltk.io.adapter.memory_key_set_adapter import MemoryKeySetAdapter 9 | from rltk.io.adapter.redis_key_set_adapter import RedisKeySetAdapter 10 | from rltk.io.adapter.leveldb_key_set_adapter import LevelDbKeySetAdapter 11 | -------------------------------------------------------------------------------- /rltk/io/adapter/dbm_key_value_adapter.py: -------------------------------------------------------------------------------- 1 | import dbm 2 | 3 | from rltk.io.adapter import KeyValueAdapter 4 | from rltk.io.serializer import Serializer, PickleSerializer 5 | 6 | 7 | class DbmKeyValueAdapter(KeyValueAdapter): 8 | """ 9 | Python builtin `DBM `_ adapter. 10 | 11 | Args: 12 | filename (str): DBM file name. 13 | dbm_class (dbm): The value can be `dbm.gnu`, `dbm.ndbm` or `dbm.dumb`. 14 | serializer (Serializer, optional): The serializer used to serialize Record object. 15 | If it's None, `PickleSerializer` will be used. Defaults to None. 16 | clean (bool, optional): Clean adapters while starting. Defaults to False. 17 | 18 | Note: 19 | Performance drops when dataset is large. 
20 | """ 21 | def __init__(self, filename, dbm_class=dbm.ndbm, serializer: Serializer = None, clean: bool = False): 22 | if not serializer: 23 | serializer = PickleSerializer() 24 | self._db = dbm_class.open(filename, 'c') 25 | self._serializer = serializer 26 | 27 | if clean: 28 | self.clean() 29 | 30 | def get(self, key): 31 | v = self._db.get(key, None) 32 | if not v: 33 | return 34 | return self._serializer.loads(v) 35 | 36 | def set(self, key, value): 37 | self._db[key] = self._serializer.dumps(value) 38 | 39 | def __next__(self): 40 | for k in self._db.keys(): 41 | key = k.decode('utf-8') 42 | yield key, self.get(key) 43 | 44 | def delete(self, key): 45 | del self._db[key] 46 | 47 | def close(self): 48 | self._db.close() 49 | -------------------------------------------------------------------------------- /rltk/io/adapter/hbase_key_value_adapter.py: -------------------------------------------------------------------------------- 1 | from rltk.record import Record 2 | from rltk.io.adapter import KeyValueAdapter 3 | from rltk.io.serializer import Serializer, PickleSerializer 4 | from rltk.utils import module_importer 5 | 6 | 7 | happybase = module_importer('happybase', 'happybase>=1.1.0') 8 | 9 | 10 | class HBaseKeyValueAdapter(KeyValueAdapter): 11 | """ 12 | Hbase Adapter. 13 | 14 | Args: 15 | host (str): Host address. 16 | table (str): HBase table name. 17 | serializer (Serializer, optional): The serializer used to serialize Record object. 18 | If it's None, `PickleSerializer` will be used. Defaults to None. 19 | key_prefix (str, optional): The prefix of HBase row key. 20 | clean (bool, optional): Clean adapters while starting. Defaults to False. 21 | **kwargs: Other parameters used by `happybase.Connection `_ . 22 | 23 | Note: 24 | The timeout of thrift in hbase-site.xml needs to increase:: 25 | 26 | 27 | hbase.thrift.server.socket.read.timeout 28 | 6000000 29 | 30 | 31 | hbase.thrift.connection.max-idletime 32 | 18000000 33 | 34 | """ 35 | 36 | def __init__(self, host, table, serializer: Serializer = None, key_prefix: str = '', clean: bool = False, **kwargs): 37 | if not serializer: 38 | serializer = PickleSerializer() 39 | self._conn = happybase().Connection(host=host, timeout=None, **kwargs) 40 | self._serializer = serializer 41 | self._key_prefix = key_prefix 42 | self._family_name = 'rltk' 43 | self._col_name = 'obj' 44 | self._fam_col_name = '{}:{}'.format(self._family_name, self._col_name).encode('utf-8') 45 | 46 | if table.encode('utf-8') not in self._conn.tables(): 47 | self._create_table(table) 48 | self._table = self._conn.table(table) 49 | 50 | if clean: 51 | self.clean() 52 | 53 | #: parallel-safe 54 | parallel_safe = True 55 | 56 | def _encode_key(self, key): 57 | happybase.Connection('asd') 58 | return '{prefix}{key}'.format(prefix=self._key_prefix, key=key).encode('utf-8') 59 | 60 | def _decode_key(self, key): 61 | key = key.decode('utf-8') 62 | return key[len(self._key_prefix):] 63 | 64 | def close(self): 65 | try: 66 | self._conn.close() 67 | except: 68 | pass 69 | 70 | def _create_table(self, table_name): 71 | self._conn.create_table(table_name, {self._family_name: dict()}) 72 | 73 | def get(self, key) -> object: 74 | return self._serializer.loads(self._table.row(self._encode_key(key))[self._fam_col_name]) 75 | 76 | def set(self, key, value: object): 77 | return self._table.put(self._encode_key(key), {self._fam_col_name: self._serializer.dumps(value)}) 78 | 79 | def delete(self, key): 80 | return self._table.delete(self._encode_key(key)) 81 | 82 | def 
__next__(self): 83 | for key, data in self._table.scan( 84 | row_prefix=self._key_prefix.encode('utf-8'), filter=b'FirstKeyOnlyFilter()'): 85 | yield self._decode_key(key), self._serializer.loads(data[self._fam_col_name]) 86 | -------------------------------------------------------------------------------- /rltk/io/adapter/key_set_adapter.py: -------------------------------------------------------------------------------- 1 | import json 2 | import io 3 | 4 | 5 | class KeySetAdapter(object): 6 | """ 7 | Key Set Adapter. 8 | """ 9 | 10 | def get(self, key: str): 11 | """ 12 | Get value by key. 13 | 14 | Args: 15 | key (str): Key. 16 | 17 | Returns: 18 | set: A set of values, None if key doesn't exist. 19 | """ 20 | raise NotImplementedError 21 | 22 | def set(self, key: str, value: set): 23 | """ 24 | Set a set by key. 25 | 26 | Args: 27 | key (str): Key. 28 | value (builtins.set): Value set. 29 | """ 30 | raise NotImplementedError 31 | 32 | def add(self, key: str, value: object): 33 | """ 34 | Add value to a set by key. If key doesn't exist, create one. 35 | 36 | Args: 37 | key (str): Key. 38 | value (object): Value. 39 | """ 40 | raise NotImplementedError 41 | 42 | def remove(self, key: str, value: object): 43 | """ 44 | Remove value from a set by key. 45 | 46 | Args: 47 | key (str): Key. 48 | value (object): Value. 49 | """ 50 | raise NotImplementedError 51 | 52 | def delete(self, key: str): 53 | """ 54 | Delete a set by key. 55 | 56 | Args: 57 | key (str): Key. 58 | """ 59 | raise NotImplementedError 60 | 61 | def dump(self, f: io.IOBase): 62 | """ 63 | Dump data to json lines format. Each json object is formatted as `{key: [value1, value2, ...]}`. 64 | 65 | Args: 66 | f (io.IOBase): IO handler. 67 | """ 68 | for k, ss in self: 69 | obj = {k: list(ss)} 70 | f.write(json.dumps(obj) + '\n') 71 | 72 | def clean(self): 73 | """ 74 | Delete all keys in this adapter. 75 | """ 76 | for k, _ in self: 77 | self.delete(k) 78 | 79 | def __init__(self): 80 | pass 81 | 82 | def __del__(self): 83 | """ 84 | Same as :meth:`close`. 85 | """ 86 | self.close() 87 | 88 | def __iter__(self): 89 | """ 90 | Same as :meth:`__next__`. 91 | """ 92 | return self.__next__() 93 | 94 | def __next__(self): 95 | """ 96 | Iterator of the data store. This is not required. 97 | 98 | Returns: 99 | iter: key, set 100 | """ 101 | pass 102 | 103 | def close(self): 104 | """ 105 | Close handler if needed. 106 | """ 107 | pass 108 | -------------------------------------------------------------------------------- /rltk/io/adapter/key_value_adapter.py: -------------------------------------------------------------------------------- 1 | class KeyValueAdapter(object): 2 | """ 3 | Super class of key value store adapters. 4 | """ 5 | def __init__(self): 6 | pass 7 | 8 | def __del__(self): 9 | """ 10 | Same as :meth:`close`. 11 | """ 12 | self.close() 13 | 14 | #: If this adapter is parallel-safe. Defaults to False if it's not overwritten in concrete class. 15 | parallel_safe = False 16 | 17 | def get(self, key: str) -> object: 18 | """ 19 | Get value. 20 | 21 | Args: 22 | key (str): Key. 23 | 24 | Returns: 25 | object: 26 | """ 27 | raise NotImplementedError 28 | 29 | def set(self, key: str, value: object): 30 | """ 31 | Set value. 32 | 33 | Args: 34 | key (str): Key. 35 | value (object): Value. 36 | """ 37 | raise NotImplementedError 38 | 39 | def delete(self, key): 40 | """ 41 | Delete value. 42 | 43 | Args: 44 | key (str): Key.
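# A minimal sketch of a custom KeyValueAdapter following the interface above (a hypothetical
# example class, not part of RLTK): get / set / delete plus __next__ cover the interface, and
# iteration over the adapter is driven by __next__.
import rltk

class DictKeyValueAdapter(rltk.KeyValueAdapter):
    def __init__(self):
        self._store = {}

    def get(self, key):
        return self._store.get(key)

    def set(self, key, value):
        self._store[key] = value

    def delete(self, key):
        del self._store[key]

    def __next__(self):
        for k, v in self._store.items():
            yield k, v

a = DictKeyValueAdapter()
a.set('r1', 'v1')
print(a.get('r1'))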
45 | """ 46 | raise NotImplementedError 47 | 48 | def clean(self): 49 | """ 50 | Delete all keys in adapter. 51 | """ 52 | for key, _ in self: 53 | self.delete(key) 54 | 55 | def __iter__(self): 56 | """ 57 | Same as :meth:`__next__`. 58 | """ 59 | return self.__next__() 60 | 61 | def __next__(self): 62 | """ 63 | Iterator of the data store. This is not required. 64 | 65 | Returns: 66 | iter: key, value 67 | """ 68 | pass 69 | 70 | def close(self): 71 | """ 72 | Close handler if needed. 73 | """ 74 | pass -------------------------------------------------------------------------------- /rltk/io/adapter/leveldb_key_set_adapter.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from rltk.io.serializer import Serializer, PickleSerializer 4 | from rltk.io.adapter.key_set_adapter import KeySetAdapter 5 | from rltk.utils import module_importer 6 | 7 | 8 | plyvel = module_importer('plyvel', 'plyvel>=1.0.5', ''' 9 | Please install LevelDB's system level package first: https://github.com/google/leveldb . 10 | 11 | If you are using Mac and installed LevelDB by HomeBrew, 12 | please make sure that `plyvel` refers to correct library file while installing: 13 | 14 | pip uninstall plyvel 15 | CFLAGS='-mmacosx-version-min=10.7 -stdlib=libc++' pip install --no-cache-dir plyvel 16 | ''') 17 | 18 | 19 | class LevelDbKeySetAdapter(KeySetAdapter): 20 | """ 21 | `LevelDB `_ key set adapter. 22 | LevelDB is a serverless, stand-alone key value store. It can be used as a local file system store. 23 | 24 | 25 | Args: 26 | path (str): The directory path used by LevelDB. 27 | name (str): Because LevelDB only has a single key space, \ 28 | this is used as name space. 29 | serializer (Serializer, optional): The serializer used to serialize each object in set. 30 | If it's None, `PickleSerializer` will be used. Defaults to None. 31 | clean (bool, optional): Clean adapters while starting. Defaults to False. 32 | kwargs: Other key word arguments for `plyvel.DB `_. 33 | 34 | Note: 35 | A particular LevelDB database only supports accessing by one process at one time. 36 | This adapter uses singleton (in one RLTK instance) to make sure only one `plyvel.DB` is created. 37 | Different `name` s can be used if you don't want to create multiple databases. 
38 | """ 39 | _db_instance = None 40 | _db_ref_count = 0 41 | 42 | def __init__(self, path: str, name: str, serializer: Serializer = None, clean: bool = False, **kwargs): 43 | if not serializer: 44 | serializer = PickleSerializer() 45 | 46 | # leveldb's connection can only be a singleton 47 | if not self.__class__._db_instance: 48 | if not os.path.exists(path): 49 | os.mkdir(path) 50 | self.__class__._db_instance = plyvel().DB(path, create_if_missing=True, **kwargs) 51 | self._db = self.__class__._db_instance 52 | self.__class__._db_ref_count += 1 53 | 54 | self._prefix = '{name}_'.format(name=name) 55 | self._prefix_db = self._db.prefixed_db(self._encode(self._prefix)) 56 | self._serializer = serializer 57 | 58 | if clean: 59 | self.clean() 60 | 61 | @staticmethod 62 | def _encode(string): 63 | return string.encode(encoding='utf-8') 64 | 65 | @staticmethod 66 | def _decode(bytes_): 67 | return bytes_.decode(encoding='utf-8') 68 | 69 | def _get(self, key): 70 | v = self._prefix_db.get(key) 71 | if not v: 72 | return 73 | return self._serializer.loads(v) 74 | 75 | def get(self, key): 76 | return self._get(self._encode(key)) 77 | 78 | def set(self, key, value): 79 | if not isinstance(value, set): 80 | raise ValueError('value must be a set') 81 | self.delete(key) 82 | self._prefix_db.put(self._encode(key), self._serializer.dumps(value)) 83 | 84 | def add(self, key, value): 85 | set_ = self.get(key) 86 | if not set_: 87 | set_ = set([]) 88 | set_.add(value) 89 | return self.set(key, set_) 90 | 91 | def remove(self, key, value): 92 | set_ = self.get(key) 93 | if not set_: 94 | return 95 | set_.remove(value) 96 | return self.set(key, set_) 97 | 98 | def delete(self, key): 99 | return self._prefix_db.delete(self._encode(key)) 100 | 101 | def __next__(self): 102 | for key in self._prefix_db.iterator(include_value=False): 103 | yield self._decode(key), self._get(key) 104 | 105 | def close(self): 106 | self.__class__._db_ref_count -= 1 107 | if self.__class__._db_ref_count == 0: 108 | self._db.close() 109 | -------------------------------------------------------------------------------- /rltk/io/adapter/memory_key_set_adapter.py: -------------------------------------------------------------------------------- 1 | from rltk.io.adapter.key_set_adapter import KeySetAdapter 2 | 3 | 4 | class MemoryKeySetAdapter(KeySetAdapter): 5 | """ 6 | Memory key set a adapter. 7 | """ 8 | 9 | def __init__(self): 10 | self._store = dict() 11 | 12 | def get(self, key): 13 | return self._store.get(key) 14 | 15 | def set(self, key, value): 16 | if not isinstance(value, set): 17 | raise ValueError('value must be a set') 18 | self._store[key] = value 19 | 20 | def add(self, key, value): 21 | if key not in self._store: 22 | self._store[key] = set() 23 | self._store[key].add(value) 24 | 25 | def remove(self, key, value): 26 | self._store[key].remove(value) 27 | 28 | def delete(self, key): 29 | del self._store[key] 30 | 31 | def clean(self): 32 | self._store = dict() 33 | 34 | def __next__(self): 35 | for k, v in self._store.items(): 36 | yield k, v 37 | -------------------------------------------------------------------------------- /rltk/io/adapter/memory_key_value_adapter.py: -------------------------------------------------------------------------------- 1 | from rltk.io.adapter import KeyValueAdapter 2 | 3 | 4 | class MemoryKeyValueAdapter(KeyValueAdapter): 5 | """ 6 | Basic in-memory (dict) adapter. 
7 | """ 8 | def __init__(self): 9 | self._dict = dict() 10 | 11 | def get(self, key): 12 | return self._dict.get(key) 13 | 14 | def set(self, key, value: object): 15 | self._dict[key] = value 16 | 17 | def __next__(self): 18 | for key, value in self._dict.items(): 19 | yield key, value 20 | 21 | def delete(self, key): 22 | del self._dict[key] 23 | 24 | def clean(self): 25 | self._dict = dict() 26 | -------------------------------------------------------------------------------- /rltk/io/adapter/redis_key_set_adapter.py: -------------------------------------------------------------------------------- 1 | from rltk.io.serializer import Serializer, PickleSerializer 2 | from rltk.io.adapter.key_set_adapter import KeySetAdapter 3 | from rltk.utils import module_importer 4 | 5 | 6 | redis = module_importer('redis', 'redis>=2.0.0') 7 | 8 | 9 | class RedisKeySetAdapter(KeySetAdapter): 10 | """ 11 | Redis key set adapter. 12 | 13 | Args: 14 | host (str): Host address. 15 | serializer (Serializer, optional): The serializer used to serialize Record object. 16 | If it's None, `PickleSerializer` will be used. Defaults to None. 17 | key_prefix (str, optional): Prefix of key in redis. Defaults to empty string. 18 | clean (bool, optional): Clean adapters while starting. Defaults to False. 19 | **kwargs: Other parameters used by `redis.Redis `_ . 20 | """ 21 | 22 | def __init__(self, host, key_prefix: str = '', serializer: Serializer=None, clean: bool = False, **kwargs): 23 | if not serializer: 24 | serializer = PickleSerializer() 25 | self._redis = redis().Redis(host=host, **kwargs) 26 | self._serializer = serializer 27 | self._key_prefix = key_prefix 28 | 29 | if clean: 30 | self.clean() 31 | 32 | def _encode_key(self, key): 33 | return '{prefix}{key}'.format(prefix=self._key_prefix, key=key) 34 | 35 | def _decode_key(self, key): 36 | key = key.decode('utf-8') 37 | return key[len(self._key_prefix):] 38 | 39 | def get(self, key): 40 | return self._get(self._encode_key(key)) 41 | 42 | def _get(self, key): 43 | v = set([self._serializer.loads(v) for v in self._redis.smembers(key)]) 44 | if len(v) != 0: 45 | return v 46 | 47 | def set(self, key, value): 48 | if not isinstance(value, set): 49 | raise ValueError('value must be a set') 50 | self.delete(key) 51 | for v in value: 52 | self.add(key, v) 53 | 54 | def add(self, key, value): 55 | return self._redis.sadd(self._encode_key(key), self._serializer.dumps(value)) 56 | 57 | def remove(self, key, value): 58 | return self._redis.srem(self._encode_key(key), self._serializer.dumps(value)) 59 | 60 | def delete(self, key): 61 | return self._redis.delete(self._encode_key(key)) 62 | 63 | def __next__(self): 64 | # scan_iter() returns generator, keys() returns array 65 | for key in self._redis.scan_iter(self._encode_key('*')): 66 | yield self._decode_key(key), self._get(key) 67 | -------------------------------------------------------------------------------- /rltk/io/adapter/redis_key_value_adapter.py: -------------------------------------------------------------------------------- 1 | from rltk.record import Record 2 | from rltk.io.adapter import KeyValueAdapter 3 | from rltk.io.serializer import Serializer, PickleSerializer 4 | from rltk.utils import module_importer 5 | 6 | 7 | redis = module_importer('redis', 'redis>=2.0.0') 8 | 9 | 10 | class RedisKeyValueAdapter(KeyValueAdapter): 11 | """ 12 | Redis adapter. 13 | 14 | Args: 15 | host (str): Host address. 16 | serializer (Serializer, optional): The serializer used to serialize Record object. 
17 | If it's None, `PickleSerializer` will be used. Defaults to None. 18 | key_prefix (str, optional): Prefix of key in redis. Defaults to empty string. 19 | clean (bool, optional): Clean adapters while starting. Defaults to False. 20 | **kwargs: Other parameters used by `redis.Redis `_ . 21 | """ 22 | def __init__(self, host, serializer: Serializer=None, key_prefix: str = '', clean: bool = False, **kwargs): 23 | if not serializer: 24 | serializer = PickleSerializer() 25 | self._redis = redis().Redis(host=host, **kwargs) 26 | self._serializer = serializer 27 | self._key_prefix = key_prefix 28 | 29 | if clean: 30 | self.clean() 31 | 32 | #: parallel-safe 33 | parallel_safe = True 34 | 35 | def _encode_key(self, key): 36 | return self._key_prefix + key 37 | 38 | def _decode_key(self, key): 39 | key = key.decode('utf-8') 40 | return key[len(self._key_prefix):] 41 | 42 | def get(self, key) -> object: 43 | v = self._redis.get(self._encode_key(key)) 44 | if v: 45 | return self._serializer.loads(v) 46 | 47 | def set(self, key, value: object): 48 | return self._redis.set(self._encode_key(key), self._serializer.dumps(value)) 49 | 50 | def delete(self, key): 51 | return self._redis.delete(self._encode_key(key)) 52 | 53 | def __next__(self): 54 | # scan_iter() returns generator, keys() returns array 55 | for key in self._redis.scan_iter(self._encode_key('*')): 56 | yield self._decode_key(key), self._serializer.loads(self._redis.get(key)) 57 | -------------------------------------------------------------------------------- /rltk/io/io_utils.py: -------------------------------------------------------------------------------- 1 | import io 2 | 3 | 4 | def get_file_handler(f, mode='r'): 5 | """ 6 | Helper function for getting file handler. 7 | 8 | Args: 9 | f (Union[str,io.IOBase]): File path or handler. 10 | mode (str, optional): Parameter mode in :py:meth:`open`. Defaults to `r`. 11 | 12 | Returns: 13 | io.IOBase: File handler. 14 | """ 15 | if isinstance(f, io.IOBase): 16 | return f 17 | 18 | return open(f, mode) 19 | -------------------------------------------------------------------------------- /rltk/io/reader/__init__.py: -------------------------------------------------------------------------------- 1 | from rltk.io.reader.reader import Reader 2 | from rltk.io.reader.array_reader import ArrayReader 3 | from rltk.io.reader.dataframe_reader import DataFrameReader 4 | from rltk.io.reader.csv_reader import CSVReader 5 | from rltk.io.reader.jsonlines_reader import JsonLinesReader 6 | from rltk.io.reader.ground_truth_reader import GroundTruthReader 7 | -------------------------------------------------------------------------------- /rltk/io/reader/array_reader.py: -------------------------------------------------------------------------------- 1 | from rltk.io.reader import Reader 2 | 3 | 4 | class ArrayReader(Reader): 5 | """ 6 | Array Reader. 7 | 8 | Args: 9 | array (list): Array. 
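        Example:
            A minimal sketch (the array contents are illustrative):

            >>> reader = ArrayReader([{'id': '1'}, {'id': '2'}])
            >>> [raw['id'] for raw in reader]
            ['1', '2']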
10 | """ 11 | 12 | def __init__(self, array): 13 | try: 14 | for _ in array: 15 | break 16 | except TypeError: 17 | raise TypeError('Can not iterate on ArrayReader') 18 | 19 | self._array = array 20 | 21 | def __next__(self): 22 | for item in self._array: 23 | yield item 24 | 25 | -------------------------------------------------------------------------------- /rltk/io/reader/csv_reader.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | from rltk.io.reader import Reader 4 | from rltk.io.io_utils import get_file_handler 5 | 6 | 7 | class CSVReader(Reader): 8 | """ 9 | CSV reader. 10 | 11 | Args: 12 | file_handler (str/io.IOBase): File name or file handler of input file. 13 | **kwargs: Other parameters used by `csv.DictReader `_ . 14 | """ 15 | 16 | def __init__(self, file_handler, **kwargs): 17 | self._file_handler = get_file_handler(file_handler) 18 | self._csv_reader = csv.DictReader(self._file_handler, **kwargs) 19 | 20 | def __next__(self): 21 | for obj in self._csv_reader: 22 | yield {t[0]: t[1] for t in obj.items()} 23 | 24 | def close(self): 25 | try: 26 | self._file_handler.close() 27 | except: 28 | pass 29 | -------------------------------------------------------------------------------- /rltk/io/reader/dataframe_reader.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | from rltk.io.reader import Reader 4 | 5 | 6 | class DataFrameReader(Reader): 7 | """ 8 | Pandas DataFrame Reader. 9 | 10 | Args: 11 | df (pandas.DataFrame): DataFrame. 12 | keep_dataframe_default_index (bool, optional): If True, add a key `dataframe_default_index` holding the \ 13 | original index in Dataframe. Defaults to False. 14 | """ 15 | 16 | def __init__(self, df: pd.DataFrame, keep_dataframe_default_index: bool = False): 17 | self._df = df 18 | self._keep_dataframe_default_index = keep_dataframe_default_index 19 | 20 | def __next__(self): 21 | if self._keep_dataframe_default_index: 22 | for i, item in self._df.iterrows(): 23 | yield dict(item.to_dict(), dataframe_default_index=i) 24 | else: 25 | for _, item in self._df.iterrows(): 26 | yield item.to_dict() 27 | -------------------------------------------------------------------------------- /rltk/io/reader/ground_truth_reader.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | from rltk.io.reader import Reader 4 | from rltk.io.io_utils import get_file_handler 5 | 6 | 7 | class GroundTruthReader(Reader): 8 | """ 9 | Ground truth reader. Currently, ground truth stores in CSV format. 10 | 11 | Args: 12 | file_handler (str/io.IOBase): File name or file handler of input file. 13 | **kwargs: Other parameters used by `csv.DictReader `_ . 
14 | """ 15 | 16 | def __init__(self, file_handler, **kwargs): 17 | self._file_handler = get_file_handler(file_handler) 18 | self._csv_reader = csv.DictReader(self._file_handler, **kwargs) 19 | 20 | def __next__(self): 21 | for obj in self._csv_reader: 22 | yield {t[0]: t[1] for t in obj.items()} 23 | 24 | def close(self): 25 | try: 26 | self._file_handler.close() 27 | except: 28 | pass 29 | -------------------------------------------------------------------------------- /rltk/io/reader/jsonlines_reader.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | from rltk.io.reader import Reader 5 | from rltk.io.io_utils import get_file_handler 6 | 7 | 8 | class JsonLinesReader(Reader): 9 | """ 10 | `JSON Lines `_ Reader. 11 | 12 | Args: 13 | file_handler (str/io.IOBase): File name or file handler of input file. 14 | ignore_blank_line (bool): If blank line should be ignored. Defaults to True. 15 | """ 16 | 17 | def __init__(self, file_handler, ignore_blank_line=True): 18 | self._file_handler = get_file_handler(file_handler) 19 | self._ignore_blank_line = ignore_blank_line 20 | 21 | def __next__(self): 22 | for line in self._file_handler: 23 | if line.strip() == '': 24 | if self._ignore_blank_line: 25 | continue 26 | else: 27 | raise ValueError('Blank line detected') 28 | yield json.loads(line) 29 | 30 | def close(self): 31 | try: 32 | self._file_handler.close() 33 | except: 34 | pass 35 | -------------------------------------------------------------------------------- /rltk/io/reader/reader.py: -------------------------------------------------------------------------------- 1 | import io 2 | 3 | 4 | class Reader(object): 5 | """ 6 | Reader. 7 | """ 8 | 9 | def __init__(self): 10 | pass 11 | 12 | def __iter__(self): 13 | """ 14 | Same as :meth:`__next__`. 15 | """ 16 | return self.__next__() 17 | 18 | def __next__(self): 19 | """ 20 | Iterator. 21 | 22 | Returns: 23 | iter: `raw_object`. The raw_object is a dict which represents raw data of a logical row. 24 | """ 25 | raise NotImplementedError 26 | 27 | def __del__(self): 28 | """ 29 | Same as :meth:`close` 30 | """ 31 | 32 | def close(self): 33 | """ 34 | Close handler. 35 | """ 36 | pass 37 | -------------------------------------------------------------------------------- /rltk/io/serializer/__init__.py: -------------------------------------------------------------------------------- 1 | from rltk.io.serializer.serializer import Serializer 2 | from rltk.io.serializer.pickle_serializer import PickleSerializer 3 | -------------------------------------------------------------------------------- /rltk/io/serializer/pickle_serializer.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | from rltk.io.serializer import Serializer 4 | 5 | 6 | class PickleSerializer(Serializer): 7 | """ 8 | `Pickle serializer `_ . 9 | """ 10 | 11 | def loads(self, string): 12 | return pickle.loads(string) 13 | 14 | def dumps(self, obj): 15 | return pickle.dumps(obj) 16 | -------------------------------------------------------------------------------- /rltk/io/serializer/serializer.py: -------------------------------------------------------------------------------- 1 | class Serializer(object): 2 | """ 3 | Serialize and deserialize object. This is the super class. 4 | """ 5 | 6 | def loads(self, obj): 7 | """ 8 | Load a serialized object. 9 | 10 | Args: 11 | obj (obj): For most of times, it's byte or string. 12 | 13 | Returns: 14 | obj: Python object. 
15 | """ 16 | raise NotImplementedError 17 | 18 | def dumps(self, obj): 19 | """ 20 | Serialize the given object. 21 | 22 | Args: 23 | obj (obj): Python object. 24 | 25 | Returns: 26 | obj: Serialized object. 27 | """ 28 | raise NotImplementedError 29 | -------------------------------------------------------------------------------- /rltk/io/writer/__init__.py: -------------------------------------------------------------------------------- 1 | from rltk.io.writer.writer import Writer 2 | from rltk.io.writer.ground_truth_writer import GroundTruthWriter 3 | -------------------------------------------------------------------------------- /rltk/io/writer/ground_truth_writer.py: -------------------------------------------------------------------------------- 1 | import csv 2 | 3 | from rltk.io.writer import Writer 4 | from rltk.io.io_utils import get_file_handler 5 | 6 | 7 | class GroundTruthWriter(Writer): 8 | """ 9 | Ground truth writer. 10 | 11 | Args: 12 | file_handler (io.IOBase): It can be file name or file handler. 13 | """ 14 | def __init__(self, file_handler): 15 | self._file_handler = get_file_handler(file_handler, 'w') 16 | fieldnames = ['id1', 'id2', 'label'] 17 | self._csv_writer = csv.DictWriter(self._file_handler, fieldnames=fieldnames) 18 | self._csv_writer.writeheader() 19 | 20 | def write(self, id1: str, id2: str, label: bool): 21 | """ 22 | Writer to file. 23 | 24 | Args: 25 | id1 (str): Id 1. 26 | id2 (str): Id 2. 27 | label (bool): Positive (True) or negative (False). 28 | """ 29 | self._csv_writer.writerow({'id1': id1, 'id2': id2, 'label': label}) 30 | 31 | def close(self): 32 | try: 33 | self._file_handler.close() 34 | except: 35 | pass 36 | -------------------------------------------------------------------------------- /rltk/io/writer/writer.py: -------------------------------------------------------------------------------- 1 | import io 2 | 3 | 4 | class Writer(object): 5 | """ 6 | Writer. 7 | """ 8 | def __init__(self): 9 | pass 10 | 11 | def write(self): 12 | """ 13 | Write content. 14 | """ 15 | raise NotImplementedError 16 | 17 | def __del__(self): 18 | """ 19 | Same to :meth:`close`. 20 | """ 21 | self.close() 22 | 23 | def close(self): 24 | """ 25 | Close handler. 26 | """ 27 | pass 28 | -------------------------------------------------------------------------------- /rltk/record.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Callable 3 | 4 | 5 | # Record ID should be string 6 | re_record_id = re.compile(r'^[^*]{1,255}$') 7 | # Valid python's property name 8 | # https://docs.python.org/3.6/reference/lexical_analysis.html#identifiers 9 | re_valid_property_name = re.compile(r'^[A-Za-z_]{1}[\w]*$') 10 | 11 | 12 | class Record(object): 13 | """ 14 | Record representation. Properties should be defined for further usage. 15 | 16 | Args: 17 | raw_object (dict): Raw data which will be used to create properties. 18 | """ 19 | 20 | _remove_raw_object = False 21 | 22 | def __init__(self, raw_object): 23 | self.raw_object = raw_object 24 | 25 | @property 26 | def id(self): 27 | """ 28 | Required property. Type has to be utf-8 string. 29 | """ 30 | raise NotImplementedError 31 | 32 | def __eq__(self, other): 33 | """ 34 | Only if both instances have the same class and id. 35 | 36 | Returns: 37 | bool: Equal or not. 
38 | """ 39 | if not isinstance(other, self.__class__): # class should be exactly the same 40 | return False 41 | return self.id == other.id 42 | 43 | 44 | class cached_property(property): 45 | """ 46 | Decorator. 47 | If a Record property is decorated, the final value of it will be pre-calculated. 48 | """ 49 | def __init__(self, func): 50 | self.func = func 51 | 52 | def __get__(self, obj, cls): 53 | """ 54 | Args: 55 | obj (object): Record instance 56 | cls (class): Record class 57 | Returns: 58 | object: cached value 59 | """ 60 | if obj is None: 61 | return self 62 | 63 | # create property if it's not there 64 | cached_name = self.func.__name__ 65 | if cached_name not in obj.__dict__: 66 | obj.__dict__[cached_name] = self.func(obj) 67 | 68 | value = obj.__dict__.get(cached_name) 69 | return value 70 | 71 | def __reduce__(self): 72 | return cached_property.__new__, (cached_property,), {'func': self.func} 73 | 74 | 75 | def remove_raw_object(cls): 76 | """ 77 | Decorator for Record class. 78 | If a Record class is decorated, raw_object will be removed once all mark properties are cached. 79 | """ 80 | cls._remove_raw_object = True 81 | return cls 82 | 83 | 84 | def generate_record_property_cache(obj): 85 | """ 86 | Generate final value on all cached_property decorated methods. 87 | 88 | Args: 89 | obj (Record): Record instance. 90 | """ 91 | for prop_name, prop_type in obj.__class__.__dict__.items(): 92 | if isinstance(prop_type, cached_property): 93 | getattr(obj, prop_name) 94 | 95 | validate_record(obj) 96 | 97 | if obj.__class__._remove_raw_object: 98 | del obj.__dict__['raw_object'] 99 | 100 | 101 | def validate_record(obj): 102 | """ 103 | Property validator of record instance. 104 | 105 | Args: 106 | obj (Record): Record instance. 107 | 108 | Raises: 109 | TypeError: if id is not valid 110 | """ 111 | if not isinstance(obj.id, str): 112 | raise TypeError('Id in {} should be an utf-8 encoded string.'.format(obj.__class__.__name__)) 113 | if not re_record_id.match(obj.id): 114 | raise ValueError('Id is not valid') 115 | 116 | 117 | def get_property_names(cls: type): 118 | """ 119 | Get keys of property and cached_property from a record class. 120 | 121 | Args: 122 | cls (type): Record class 123 | 124 | Returns: 125 | list: Property names in class 126 | """ 127 | keys = [] 128 | for prop_name, prop_type in cls.__dict__.items(): 129 | if not isinstance(prop_type, property) and not isinstance(prop_type, cached_property): 130 | continue 131 | keys.append(prop_name) 132 | return keys 133 | 134 | 135 | def set_id(key: str, function_: Callable = None, keep_original: bool = False): 136 | """ 137 | Decorator for AutoGeneratedRecord class. 138 | If an AutoGeneratedRecord class is decorated, the value of specified key from raw_object will be used as id. 139 | 140 | Args: 141 | key (str): Key in `raw_object`. 142 | function_ (Callable, optional): Function to modify value. Signature is `function_(raw_object[key])`. 143 | Defaults to None. 144 | keep_original (bool, optional): If the original key should be kept. Defaults to False. 145 | """ 146 | def wrapper(cls): 147 | cls._id_key = key 148 | cls._id_function = function_ 149 | cls._id_keep_original = keep_original 150 | return cls 151 | return wrapper 152 | 153 | 154 | class AutoGeneratedRecord(Record): 155 | """ 156 | Properties are auto generated based on the keys in `raw_object`. 157 | 158 | `raw_object` has to contain `id` which used as id in record. 
159 | 160 | Args: 161 | raw_object (dict): Raw data which will be used to create properties. 162 | """ 163 | 164 | _id_key = 'id' 165 | _id_function = None 166 | _id_keep_original = False 167 | 168 | def __init__(self, raw_object: dict): 169 | super().__init__(raw_object) 170 | for k in raw_object.keys(): 171 | if k == self.__class__._id_key: 172 | if not self.__class__._id_keep_original: 173 | continue 174 | 175 | if not hasattr(self.__class__, k): 176 | if not re_valid_property_name.match(k): 177 | raise ValueError('Invalid property name') 178 | setattr(self.__class__, k, self.__class__._generate_property(k)) 179 | 180 | @staticmethod 181 | def _generate_property(k): 182 | 183 | @property 184 | def get_value(ins): 185 | return ins.raw_object[k] 186 | 187 | return get_value 188 | 189 | @property 190 | def id(self): 191 | id_ = self.raw_object[self.__class__._id_key] 192 | function_ = self.__class__._id_function 193 | if function_: 194 | id_ = function_(id_) 195 | return id_ 196 | -------------------------------------------------------------------------------- /rltk/remote/__init__.py: -------------------------------------------------------------------------------- 1 | from rltk.remote.remote import Remote 2 | from rltk.remote.task import Task 3 | -------------------------------------------------------------------------------- /rltk/remote/remote.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | from distributed import Worker 4 | 5 | from dask.distributed import Client 6 | from distributed.security import Security 7 | 8 | 9 | class Remote(object): 10 | """ 11 | Remote. 12 | 13 | Args: 14 | address (str): Remote scheduler address formed by `ip:port`. 15 | tls_ca_file (str, optional): TLS CA certificate file path. Defaults to None. 16 | tls_client_cert (str, optional): TLS certificate file path. Defaults to None. 17 | tls_client_key (str, optional): TLS private key file path. Defaults to None. 18 | require_encryption (bool, optional): Encrypt data exchange. Defaults to False. 19 | 20 | Note: 21 | TLS will be enabled only if all three TLS arguments are provided. 22 | Remember to change network protocol to `tls://
`. 23 | """ 24 | def __init__(self, address: str, 25 | tls_ca_file: str = None, tls_client_cert: str = None, tls_client_key: str = None, 26 | require_encryption: bool = False): 27 | # authentication 28 | sec = None 29 | if tls_ca_file and tls_client_cert and tls_client_key: 30 | sec = Security(tls_ca_file=tls_ca_file, 31 | tls_client_cert=tls_client_cert, 32 | tls_client_key=tls_client_key, 33 | require_encryption=require_encryption) 34 | 35 | # init 36 | self._client = Client(address=address, security=sec) 37 | self._client.register_worker_callbacks(Remote._worker_startup) 38 | 39 | @staticmethod 40 | def _worker_startup(dask_worker: Worker): 41 | os.chdir(dask_worker.local_dir) 42 | 43 | def add_dependencies(self, files): 44 | """ 45 | Add list of dependencies, order matters. 46 | 47 | Args: 48 | files (list): List of dependent files. 49 | """ 50 | # TODO: automatically resolve module dependencies 51 | if isinstance(files, str): 52 | files = [files] 53 | for f in files: 54 | self._client.upload_file(f) 55 | 56 | def scatter(self, *args, **kwargs): 57 | """ 58 | Scatter data. 59 | """ 60 | return self._client.scatter(*args, **kwargs) 61 | 62 | def submit(self, func, *args, **kwargs): 63 | """ 64 | Submit function and data. 65 | 66 | Args: 67 | func (callable): User function. 68 | """ 69 | return self._client.submit(func, *args, **kwargs) 70 | 71 | def fetch(self, futures_, **kwargs): 72 | """ 73 | Fetch data of future objects. 74 | 75 | Args: 76 | futures_ (list): Future objects. 77 | """ 78 | return self._client.gather(futures_, **kwargs) 79 | 80 | def cancel(self, futures_, **kwargs): 81 | """ 82 | Cancel job of future objects. 83 | 84 | Args: 85 | futures_ (list): Future objects. 86 | """ 87 | return self._client.cancel(futures_, **kwargs) 88 | 89 | def close(self, *args, **kwargs): 90 | """ 91 | Close connection. 92 | """ 93 | return self._client.close(*args, **kwargs) 94 | 95 | # @staticmethod 96 | # def _list_local_dir(pathname='**', *args, recursive=True): 97 | # non_py_files = [] 98 | # py_files = [] 99 | # for path in glob.glob(pathname, *args, recursive=recursive): 100 | # if os.path.isdir(path): 101 | # if path == '__pycache__': 102 | # continue 103 | # elif os.path.isfile(path): 104 | # if path.endswith('.pyc'): 105 | # continue 106 | # if path.endswith('.py'): 107 | # py_files.append(path) 108 | # else: 109 | # non_py_files.append(path) 110 | # 111 | # return non_py_files + py_files 112 | -------------------------------------------------------------------------------- /rltk/remote/task.py: -------------------------------------------------------------------------------- 1 | from threading import Semaphore 2 | from typing import Callable 3 | 4 | from rltk.remote.remote import Remote 5 | 6 | 7 | class Task(object): 8 | """ 9 | Remote task. It has similar API to :meth:`rltk.ParallelProcessor`. 10 | But do not use :meth:`rltk.ParallelProcessor` if this module is used. If you still want multiprocessing, 11 | please give each worker more processes. 12 | 13 | Args: 14 | remote (Remote): Remote object. 15 | input_handler (Callable): Input handler. 16 | output_handler (Callable): Output handler. It accepts same number of arguments to `input_handler` 's return values. 17 | chunk_size (int, optional): Size of the each data chunk. Defaults to 1000. 18 | max_queue_size (int, optional): How many chunks can be in the queue. Defaults to 10. 
19 | """ 20 | 21 | def __init__(self, remote: Remote, input_handler: Callable, output_handler: Callable, 22 | chunk_size: int = 1000, max_queue_size: int = 10): 23 | self.remote = remote 24 | self.input_handler = input_handler 25 | self.output_handler = output_handler 26 | 27 | self.chunk_data = [] # buffer 28 | self.chunk_size = chunk_size # buffer size 29 | self.future_semaphore = Semaphore(value=max_queue_size) # max num of un-return futures 30 | self.all_futures = set([]) # all un-return future objects 31 | self.done = False 32 | 33 | def start(self): 34 | """ 35 | Start listening. 36 | """ 37 | pass 38 | 39 | @staticmethod 40 | def _parse_input(input_handler, data): 41 | return [input_handler(*args, **kwargs) for args, kwargs in data] 42 | 43 | def _parse_output(self, future): 44 | if future.done(): 45 | for r in future.result(): 46 | if not isinstance(r, tuple): 47 | r = (r,) 48 | self.output_handler(*r) 49 | 50 | # release resources no matter what condition that future gets 51 | self.all_futures.remove(future) 52 | self.future_semaphore.release() 53 | 54 | def compute(self, *args, **kwargs): 55 | """ 56 | Add data to compute. 57 | """ 58 | if self.done: 59 | return 60 | 61 | if len(self.chunk_data) < self.chunk_size: 62 | self.chunk_data.append(([*args], {**kwargs})) 63 | if len(self.chunk_data) == self.chunk_size: 64 | self._submit() 65 | 66 | def _submit(self): 67 | if len(self.chunk_data) == 0: 68 | return 69 | 70 | self.future_semaphore.acquire() 71 | 72 | # scatter input data (scatter first if data is large) 73 | data_future = self.remote.scatter(self.chunk_data) 74 | 75 | # input and output must be staticmethod, create wrappers to bypass restriction 76 | future = self.remote.submit(Task._parse_input, self.input_handler, data_future) 77 | Task._parse_output_wrapper = lambda ft: Task._parse_output(self, ft) 78 | 79 | # add listener 80 | future.add_done_callback(Task._parse_output_wrapper) 81 | self.all_futures.add(future) 82 | 83 | self.chunk_data = [] 84 | 85 | def task_done(self): 86 | """ 87 | Indicate that all resources which need to compute are added. 88 | """ 89 | self.done = True 90 | self._submit() # force flush buffer 91 | 92 | def join(self): 93 | """ 94 | Block until all tasks are done. 
95 | """ 96 | while len(self.all_futures) != 0: 97 | pass 98 | -------------------------------------------------------------------------------- /rltk/similarity/__init__.py: -------------------------------------------------------------------------------- 1 | # common distance 2 | from rltk.similarity.distance import euclidean_distance, euclidean_similarity, \ 3 | manhattan_distance, manhattan_similarity 4 | 5 | # normal 6 | from rltk.similarity.equal import string_equal, number_equal 7 | from rltk.similarity.hamming import hamming_distance, hamming_similarity, normalized_hamming_distance 8 | from rltk.similarity.dice import dice_similarity 9 | from rltk.similarity.levenshtein import levenshtein_distance, levenshtein_similarity, \ 10 | damerau_levenshtein_distance, damerau_levenshtein_similarity, \ 11 | optimal_string_alignment_distance, optimal_string_alignment_similarity 12 | from rltk.similarity.needleman import needleman_wunsch_score, needleman_wunsch_similarity 13 | from rltk.similarity.jaro import jaro_winkler_distance, jaro_winkler_similarity, jaro_distance 14 | from rltk.similarity.jaccard import jaccard_index_similarity, jaccard_index_distance 15 | from rltk.similarity.cosine import cosine_similarity, string_cosine_similarity 16 | from rltk.similarity.tf_idf import tf_idf_similarity, compute_idf, compute_tf, tf_idf_cosine_similarity, TF_IDF 17 | from rltk.similarity.lcs import longest_common_subsequence_distance, metric_longest_common_subsequence 18 | from rltk.similarity.ngram import ngram_distance, ngram_similarity 19 | from rltk.similarity.qgram import qgram_distance, qgram_similarity 20 | 21 | # # hybrid 22 | from rltk.similarity.hybrid import hybrid_jaccard_similarity, monge_elkan_similarity, symmetric_monge_elkan_similarity 23 | 24 | # # phonetic 25 | from rltk.similarity.soundex import soundex_similarity, soundex 26 | from rltk.similarity.metaphone import metaphone_similarity, metaphone 27 | from rltk.similarity.nysiis import nysiis_similarity, nysiis 28 | -------------------------------------------------------------------------------- /rltk/similarity/cosine.py: -------------------------------------------------------------------------------- 1 | import math 2 | import collections 3 | import rltk.utils as utils 4 | 5 | 6 | def cosine_similarity(vec1, vec2): 7 | """ 8 | The cosine similarity between to vectors. 9 | 10 | Args: 11 | vec1 (list): Vector 1. List of integer or float. 12 | vec2 (list): Vector 2. List of integer or float. It should have the same length to vec1. 13 | 14 | Returns: 15 | float: Cosine similarity. 16 | 17 | Examples: 18 | >>> rltk.cosine_similarity([1, 2, 1, 3], [2, 5, 2, 3]) 19 | 0.91634193 20 | """ 21 | 22 | utils.check_for_none(vec1, vec2) 23 | utils.check_for_type(list, vec1, vec2) 24 | if len(vec1) != len(vec2): 25 | raise ValueError('vec1 and vec2 should have same length') 26 | 27 | v_x_y, v_x_2, v_y_2 = 0.0, 0.0, 0.0 28 | for v1, v2 in zip(vec1, vec2): # list of int / float 29 | v_x_y += v1 * v2 30 | v_x_2 += v1 * v1 31 | v_y_2 += v2 * v2 32 | 33 | return 0.0 if v_x_y == 0 else v_x_y / (math.sqrt(v_x_2) * math.sqrt(v_y_2)) 34 | 35 | 36 | def string_cosine_similarity(bag1, bag2): 37 | """ 38 | The similarity between the two strings is the cosine of the angle between these two vectors representation. 39 | 40 | Args: 41 | bag1 (list): Bag1, tokenized string sequence. 42 | bag2 (list): Bag2, tokenized string sequence. 43 | 44 | Returns: 45 | float: Cosine similarity. 
46 | """ 47 | 48 | utils.check_for_none(bag1, bag2) 49 | utils.check_for_type(list, bag1, bag2) 50 | 51 | d1 = collections.Counter(bag1) 52 | d2 = collections.Counter(bag2) 53 | 54 | intersection = set(d1.keys()) & set(d2.keys()) 55 | v_x_y = sum([d1[x] * d2[x] for x in intersection]) 56 | v_x_2 = sum([v * v for k, v in d1.items()]) 57 | v_y_2 = sum([v * v for k, v in d2.items()]) 58 | 59 | return 0.0 if v_x_y == 0 else float(v_x_y) / (math.sqrt(v_x_2) * math.sqrt(v_y_2)) 60 | -------------------------------------------------------------------------------- /rltk/similarity/dice.py: -------------------------------------------------------------------------------- 1 | import rltk.utils as utils 2 | 3 | 4 | def dice_similarity(set1, set2): 5 | """ 6 | The Dice similarity score is defined as twice the intersection of two sets divided by sum of lengths. 7 | 8 | Args: 9 | set1 (set): Set 1. 10 | set2 (set): Set 2. 11 | 12 | Returns: 13 | float: Dice similarity. 14 | 15 | Examples: 16 | >>> rltk.dice_similarity(set(['a', 'b']), set(['c', 'b'])) 17 | 0.5 18 | """ 19 | 20 | utils.check_for_none(set1, set2) 21 | utils.check_for_type(set, set1, set2) 22 | 23 | if len(set1) == 0 or len(set2) == 0: 24 | return 0 25 | 26 | return 2.0 * float(len(set1 & set2)) / float(len(set1) + len(set2)) 27 | -------------------------------------------------------------------------------- /rltk/similarity/distance.py: -------------------------------------------------------------------------------- 1 | # https://docs.scipy.org/doc/scipy-0.14.0/reference/spatial.distance.html 2 | from scipy.spatial.distance import euclidean, cityblock 3 | 4 | import rltk.utils as utils 5 | 6 | 7 | def euclidean_distance(vec1, vec2, weights=None): 8 | """ 9 | Euclidean distance. 10 | 11 | Args: 12 | vec1 (list): Vector 1. List of integer or float. 13 | vec2 (list): Vector 2. List of integer or float. It should have the same length to vec1. 14 | weights (list): Weights for each value in vectors. If it's None, all weights will be 1.0. Defaults to None. 15 | 16 | Returns: 17 | float: Euclidean distance. 18 | """ 19 | 20 | utils.check_for_none(vec1, vec2) 21 | utils.check_for_type(list, vec1, vec2) 22 | if weights: 23 | utils.check_for_type(list, weights) 24 | if len(vec1) != len(vec2): 25 | raise ValueError('vec1 and vec2 should have same length') 26 | 27 | return euclidean(vec1, vec2, weights) 28 | 29 | 30 | def euclidean_similarity(vec1, vec2, weights=None): 31 | """ 32 | Computed as 1 / (1 + euclidean_distance) 33 | """ 34 | return 1.0 / (1.0 + float(euclidean_distance(vec1, vec2, weights))) 35 | 36 | 37 | def manhattan_distance(vec1, vec2, weights=None): 38 | """ 39 | Manhattan distance. 40 | 41 | Args: 42 | vec1 (list): Vector 1. List of integer or float. 43 | vec2 (list): Vector 2. List of integer or float. It should have the same length to vec1. 44 | weights (list): Weights for each value in vectors. If it's None, all weights will be 1.0. Defaults to None. 45 | 46 | Returns: 47 | float: Manhattan distance. 
48 | """ 49 | utils.check_for_none(vec1, vec2) 50 | utils.check_for_type(list, vec1, vec2) 51 | if weights: 52 | utils.check_for_type(list, weights) 53 | if len(vec1) != len(vec2): 54 | raise ValueError('vec1 and vec2 should have same length') 55 | 56 | return cityblock(vec1, vec2, weights) 57 | 58 | 59 | def manhattan_similarity(vec1, vec2, weights=None): 60 | """ 61 | Computed as 1 / (1 + manhattan_distance) 62 | """ 63 | return 1.0 / (1.0 + manhattan_distance(vec1, vec2, weights)) 64 | -------------------------------------------------------------------------------- /rltk/similarity/equal.py: -------------------------------------------------------------------------------- 1 | import rltk.utils as utils 2 | 3 | 4 | def string_equal(str1, str2): 5 | """ 6 | Args: 7 | n1 (str): String 1. 8 | n2 (str): String 2. 9 | 10 | Returns: 11 | int: 0 for unequal and 1 for equal. 12 | """ 13 | 14 | utils.check_for_none(str1, str2) 15 | utils.check_for_type(str, str1, str2) 16 | return int(str1 == str2) 17 | 18 | 19 | def number_equal(num1, num2, epsilon=0): 20 | """ 21 | Args: 22 | n1 (int / float): Number 1. 23 | n2 (int / float): Number 2. 24 | epsilon (float, optional): Approximation margin. 25 | 26 | Returns: 27 | int: 0 for unequal and 1 for equal. 28 | """ 29 | 30 | utils.check_for_type((int, float), num1, num2) 31 | return int(abs(num1 - num2) <= epsilon) 32 | -------------------------------------------------------------------------------- /rltk/similarity/hamming.py: -------------------------------------------------------------------------------- 1 | import rltk.utils as utils 2 | 3 | 4 | def hamming_distance(s1, s2): 5 | """ 6 | Hamming distance used to measure the minimum number of substitutions required to change one sequence into the 7 | other. 8 | 9 | Args: 10 | s1 (str or list): Sequence 1. 11 | s2 (str or list): Sequence 2. 12 | 13 | Returns: 14 | int: Hamming distance between two sequences. 15 | 16 | Examples: 17 | >>> rltk.hamming_distance('ab','cd') 18 | 2 19 | >>> rltk.hamming_distance([1,2,3],[3,2,3]) 20 | 1 21 | """ 22 | 23 | utils.check_for_none(s1, s2) 24 | # utils.check_for_type(str, s1, s2) 25 | 26 | if len(s1) != len(s2): 27 | raise ValueError('Unequal length') 28 | 29 | return sum(c1 != c2 for c1, c2 in zip(s1, s2)) 30 | 31 | 32 | def normalized_hamming_distance(s1, s2): 33 | 34 | max_len = max(len(s1), len(s2)) 35 | if max_len == 0: 36 | return 0 37 | 38 | distance = hamming_distance(s1, s2) 39 | return float(distance) / max_len 40 | 41 | 42 | def hamming_similarity(s1, s2): 43 | """ 44 | Hamming similarity is computed as 1 - normalized_hamming_distance. 45 | 46 | Args: 47 | s1 (str or list): Sequence 1. 48 | s2 (str or list): Sequence 2. 49 | 50 | Returns: 51 | float: Hamming similarity. 52 | 53 | Examples: 54 | >>> rltk.hamming_similarity('ab','cd') 55 | 0 56 | >>> rltk.hamming_similarity([1,2,3],[3,2,3]) 57 | 0.666666666667 58 | """ 59 | 60 | return 1 - normalized_hamming_distance(s1, s2) 61 | -------------------------------------------------------------------------------- /rltk/similarity/hybrid.py: -------------------------------------------------------------------------------- 1 | from scipy.optimize import linear_sum_assignment 2 | import rltk.utils as utils 3 | from rltk.similarity.jaro import jaro_winkler_similarity 4 | 5 | 6 | def hybrid_jaccard_similarity(set1, set2, threshold=0.5, function=jaro_winkler_similarity, 7 | parameters=None, lower_bound=None): 8 | """ 9 | Generalized Jaccard Measure. 10 | 11 | Args: 12 | set1 (set): Set 1. 13 | set2 (set): Set 2. 
14 | threshold (float, optional): The threshold to keep the score of similarity function. \ 15 | Defaults to 0.5. 16 | function (function, optional): The reference of a similarity measure function. \ 17 | It should return the value in range [0,1]. If it is set to None, \ 18 | `jaro_winkler_similarity` will be used. 19 | parameters (dict, optional): Other parameters of function. Defaults to None. 20 | lower_bound (float): This is for early exit. If the similarity cannot possibly reach this value, \ 21 | the function returns immediately with the return value 0.0. Defaults to None. 22 | 23 | Returns: 24 | float: Hybrid Jaccard similarity. 25 | 26 | Examples: 27 | >>> def hybrid_test_similarity(m, n): 28 | ...     ... 29 | >>> rltk.hybrid_jaccard_similarity(set(['a','b','c']), set(['p', 'q']), function=hybrid_test_similarity) 30 | 0.533333333333 31 | """ 32 | 33 | utils.check_for_none(set1, set2) 34 | utils.check_for_type(set, set1, set2) 35 | 36 | parameters = parameters if isinstance(parameters, dict) else {} 37 | 38 | if len(set1) > len(set2): 39 | set1, set2 = set2, set1 40 | total_num_matches = len(set1) 41 | 42 | matching_score = [[1.0] * len(set2) for _ in range(len(set1))] 43 | row_max = [0.0] * len(set1) 44 | for i, s1 in enumerate(set1): 45 | for j, s2 in enumerate(set2): 46 | score = function(s1, s2, **parameters) 47 | if score < threshold: 48 | score = 0.0 49 | row_max[i] = max(row_max[i], score) 50 | matching_score[i][j] = 1.0 - score  # linear_sum_assignment finds the smallest elements 51 | 52 | if lower_bound: 53 | max_possible_score_sum = sum(row_max[:i+1] + [1] * (total_num_matches - i - 1)) 54 | max_possible = 1.0 * max_possible_score_sum / float(len(set1) + len(set2) - total_num_matches) 55 | if max_possible < lower_bound: 56 | return 0.0 57 | 58 | # run linear sum assignment, which finds the min score (max similarity) for each row 59 | row_idx, col_idx = linear_sum_assignment(matching_score) 60 | 61 | # recover scores 62 | score_sum = 0.0 63 | for r, c in zip(row_idx, col_idx): 64 | score_sum += 1.0 - matching_score[r][c] 65 | 66 | if len(set1) + len(set2) - total_num_matches == 0: 67 | return 1.0 68 | sim = float(score_sum) / float(len(set1) + len(set2) - total_num_matches) 69 | if lower_bound and sim < lower_bound: 70 | return 0.0 71 | return sim 72 | 73 | 74 | def monge_elkan_similarity(bag1, bag2, function=jaro_winkler_similarity, parameters=None, lower_bound=None): 75 | """ 76 | Monge Elkan similarity. 77 | 78 | Args: 79 | bag1 (list): Bag 1. 80 | bag2 (list): Bag 2. 81 | function (function, optional): The reference of a similarity measure function. \ 82 | It should return the value in range [0,1]. If it is set to None, \ 83 | `jaro_winkler_similarity` will be used. 84 | parameters (dict, optional): Other parameters of function. Defaults to None. 85 | lower_bound (float): This is for early exit. If the similarity cannot possibly reach this value, \ 86 | the function returns immediately with the return value 0.0. Defaults to None. 87 | 88 | Returns: 89 | float: Monge Elkan similarity. 90 | 91 | Note: 92 | The order of bag1 and bag2 matters. \ 93 | Alternatively, `symmetric_monge_elkan_similarity` is not sensitive to the order. 94 | If the `lower_bound` is set, the early exit condition is easier to trigger if bag1 is bigger.
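        Examples:
            A self-checking sketch (identical bags score 1.0):

            >>> rltk.monge_elkan_similarity(['ab', 'cd'], ['ab', 'cd'])
            1.0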
95 | """ 96 | 97 | utils.check_for_none(bag1, bag2) 98 | utils.check_for_type(list, bag1, bag2) 99 | 100 | parameters = parameters if isinstance(parameters, dict) else {} 101 | 102 | score_sum = 0 103 | for idx, ele1 in enumerate(bag1): 104 | max_score = utils.MIN_FLOAT 105 | for ele2 in bag2: 106 | max_score = max(max_score, function(ele1, ele2, **parameters)) 107 | score_sum += max_score 108 | 109 | # if it satisfies early exit condition 110 | if lower_bound: 111 | rest_max = len(bag1) - 1 - idx # assume the rest scores are all 1 112 | if float(score_sum + rest_max) / float(len(bag1)) < lower_bound: 113 | return 0.0 114 | 115 | sim = float(score_sum) / float(len(bag1)) 116 | if lower_bound and sim < lower_bound: 117 | return 0.0 118 | return sim 119 | 120 | 121 | def symmetric_monge_elkan_similarity(bag1, bag2, function=jaro_winkler_similarity, parameters=None, lower_bound=None): 122 | """ 123 | Symmetric Monge Elkan similarity is computed by \ 124 | (monge_elkan_similarity(b1, b2) + monge_elkan_similarity(b2, b1)) / 2. 125 | 126 | Note: 127 | If `lower_bound` is given, the return will be zero unless \ 128 | both `monge_elkan_similarity`s are greater than it. 129 | """ 130 | 131 | s1 = monge_elkan_similarity(bag1, bag2, function, parameters, lower_bound=lower_bound) 132 | if lower_bound and s1 == 0: 133 | return 0.0 134 | s2 = monge_elkan_similarity(bag2, bag1, function, parameters, lower_bound=lower_bound) 135 | if lower_bound and s2 == 0: 136 | return 0.0 137 | return (s1 + s2) / 2 138 | -------------------------------------------------------------------------------- /rltk/similarity/jaccard.py: -------------------------------------------------------------------------------- 1 | import rltk.utils as utils 2 | 3 | 4 | def _jaccard_index(set1, set2): 5 | utils.check_for_none(set1, set2) 6 | utils.check_for_type(set, set1, set2) 7 | 8 | if len(set1) == 0 or len(set2) == 0: 9 | return 0 10 | 11 | # return float(len(set1 & set2)) / float(len(set1 | set2)) 12 | 13 | inter_len = len(set1 & set2) 14 | return float(inter_len) / (len(set1) + len(set2) - inter_len) 15 | 16 | 17 | def jaccard_index_similarity(set1, set2): 18 | """ 19 | The Jaccard Index Similarity is then computed as intersection(set1, set2) / union(set1, set2). 20 | 21 | Args: 22 | set1 (set): Set 1. 23 | set2 (set): Set 2. 24 | 25 | Returns: 26 | float: Jaccard Index similarity. 27 | 28 | Examples: 29 | >>> rltk.jaccard_index_similarity(set(['a','b']), set(['a','c'])) 30 | 0.3333333333333333 31 | >>> rltk.jaccard_index_similarity(set(['a','b']), set(['c','d'])) 32 | 0.0 33 | """ 34 | return _jaccard_index(set1, set2) 35 | 36 | 37 | def jaccard_index_distance(set1, set2): 38 | """ 39 | The Jaccard Index Distance is then computed as 1 - jaccard_index_similarity. 40 | 41 | Args: 42 | set1 (set): Set 1. 43 | set2 (set): Set 2. 44 | 45 | Returns: 46 | int: Jaccard Index Distance. 
47 | """ 48 | return 1 - jaccard_index_similarity(set1, set2) 49 | -------------------------------------------------------------------------------- /rltk/similarity/jaro.py: -------------------------------------------------------------------------------- 1 | import math 2 | import rltk.utils as utils 3 | 4 | 5 | def _jaro_winkler(s1, s2, threshold=0.7, scaling_factor=0.1, prefix_len=4): 6 | jaro = _jaro_distance(s1, s2) 7 | if jaro > threshold: 8 | l = min(len(_get_prefix(s1, s2)), prefix_len) # max len of common prefix is 4 9 | jaro += (scaling_factor * l * (1.0 - jaro)) 10 | return jaro 11 | 12 | 13 | def jaro_winkler_similarity(s1, s2, threshold=0.7, scaling_factor=0.1, prefix_len=4): 14 | """ 15 | The max length for common prefix is 4. 16 | 17 | Args: 18 | s1 (str): Sequence 1. 19 | s2 (str): Sequence 2. 20 | threshold (int, optional): Boost threshold, prefix bonus is only added when compared strings have a Jaro Distance above it. Defaults to 0.7. 21 | scaling_factor (int, optional): Scaling factor for how much the score is adjusted upwards for having common prefixes. Defaults to 0.1. 22 | 23 | Returns: 24 | float: Jaro Winkler Similarity. 25 | 26 | Examples: 27 | >>> rltk.jaro_winkler_similarity('abchello', 'abcworld') 28 | 0.6833333333333332 29 | >>> rltk.jaro_winkler_similarity('hello', 'world') 30 | 0.4666666666666666 31 | """ 32 | return _jaro_winkler(s1, s2, threshold, scaling_factor, prefix_len) 33 | 34 | 35 | def jaro_winkler_distance(s1, s2, threshold=0.7, scaling_factor=0.1, prefix_len=4): 36 | """ 37 | Jaro Winkler Distance is computed as 1 - jaro_winkler_similarity. 38 | 39 | Args: 40 | s1 (str): Sequence 1. 41 | s2 (str): Sequence 2. 42 | threshold (int, optional): Boost threshold, prefix bonus is only added when compared strings have a Jaro Distance above it. Defaults to 0.7. 43 | scaling_factor (int, optional): Scaling factor for how much the score is adjusted upwards for having common prefixes. Defaults to 0.1. 44 | 45 | Returns: 46 | float: Jaro Winkler Similarity. 47 | 48 | Examples: 49 | >>> rltk.jaro_winkler_similarity('abchello', 'abcworld') 50 | 0.6833333333333332 51 | >>> rltk.jaro_winkler_similarity('hello', 'world') 52 | 0.4666666666666666 53 | """ 54 | return 1 - _jaro_winkler(s1, s2, threshold, scaling_factor, prefix_len) 55 | 56 | 57 | def jaro_distance(s1, s2): 58 | """ 59 | Args: 60 | s1 (str): Sequence 1. 61 | s2 (str): Sequence 2. 62 | 63 | Returns: 64 | float: Jaro Distance. 
65 | 66 | Examples: 67 | >>> rltk.jaro_distance('abc', 'abd') 68 | 0.7777777777777777 69 | >>> rltk.jaro_distance('abccd', 'abcdc') 70 | 0.9333333333333332 71 | """ 72 | return _jaro_distance(s1, s2) 73 | 74 | 75 | def _jaro_distance(s1, s2): 76 | # code from https://github.com/nap/jaro-winkler-distance 77 | # Copyright Jean-Bernard Ratte 78 | 79 | utils.check_for_none(s1, s2) 80 | utils.check_for_type(str, s1, s2) 81 | 82 | # s1 = utils.unicode_normalize(s1) 83 | # s2 = utils.unicode_normalize(s2) 84 | 85 | shorter, longer = s1.lower(), s2.lower() 86 | 87 | if len(s1) > len(s2): 88 | longer, shorter = shorter, longer 89 | 90 | m1 = _get_matching_characters(shorter, longer) 91 | m2 = _get_matching_characters(longer, shorter) 92 | 93 | if len(m1) == 0 or len(m2) == 0: 94 | return 0.0 95 | 96 | return (float(len(m1)) / len(shorter) + 97 | float(len(m2)) / len(longer) + 98 | float(len(m1) - _transpositions(m1, m2)) / len(m1)) / 3.0 99 | 100 | 101 | def _get_diff_index(first, second): 102 | if first == second: 103 | return -1 104 | 105 | if not first or not second: 106 | return 0 107 | 108 | max_len = min(len(first), len(second)) 109 | for i in range(0, max_len): 110 | if not first[i] == second[i]: 111 | return i 112 | 113 | return max_len 114 | 115 | 116 | def _get_prefix(first, second): 117 | if not first or not second: 118 | return '' 119 | 120 | index = _get_diff_index(first, second) 121 | if index == -1: 122 | return first 123 | elif index == 0: 124 | return '' 125 | else: 126 | return first[0:index] 127 | 128 | 129 | def _get_matching_characters(first, second): 130 | common = [] 131 | limit = math.floor(min(len(first), len(second)) / 2) 132 | 133 | for i, l in enumerate(first): 134 | left, right = int(max(0, i - limit)), int(min(i + limit + 1, len(second))) 135 | if l in second[left:right]: 136 | common.append(l) 137 | second = second[0:second.index(l)] + '*' + second[second.index(l) + 1:] 138 | 139 | return ''.join(common) 140 | 141 | 142 | def _transpositions(first, second): 143 | return math.floor(len([(f, s) for f, s in zip(first, second) if not f == s]) / 2.0) 144 | -------------------------------------------------------------------------------- /rltk/similarity/lcs.py: -------------------------------------------------------------------------------- 1 | import rltk.utils as utils 2 | 3 | 4 | def _lcs(s1, s2): 5 | m, n = len(s1), len(s2) 6 | 7 | dp = [[None] * (n + 1) for i in range(m + 1)] 8 | 9 | for i in range(m + 1): 10 | for j in range(n + 1): 11 | if i == 0 or j == 0: 12 | dp[i][j] = 0 13 | elif s1[i - 1] == s2[j - 1]: 14 | dp[i][j] = dp[i - 1][j - 1] + 1 15 | else: 16 | dp[i][j] = max(dp[i - 1][j], dp[i][j - 1]) 17 | 18 | return dp[m][n] 19 | 20 | 21 | def longest_common_subsequence_distance(s1, s2): 22 | """ 23 | The LCS distance between strings X (of length n) and Y (of length m) is n + m - 2 |LCS(X, Y)| min = 0 max = n + m 24 | 25 | Args: 26 | s1 (str): Sequence 1. 27 | s2 (str): Sequence 2. 28 | 29 | Returns: 30 | float: Longest Common Subsequence Distance. 
31 | 32 | Examples: 33 | >>> rltk.longest_common_subsequence_distance('abcd', 'acbd') 34 | 2 35 | >>> rltk.longest_common_subsequence_distance('abcdefg', 'acef') 36 | 3 37 | """ 38 | utils.check_for_none(s1, s2) 39 | utils.check_for_type(str, s1, s2) 40 | 41 | m, n = len(s1), len(s2) 42 | 43 | # dp = [[None] * (n + 1) for i in range(m + 1)] 44 | 45 | lcs = _lcs(s1, s2) 46 | return n + m - 2 * lcs 47 | 48 | 49 | def metric_longest_common_subsequence(s1, s2): 50 | """ 51 | The Metric LCS distance between 2 strings is based on the LCS between them: it is computed as 1 - |LCS(s1, s2)| / max(|s1|, |s2|). 52 | 53 | Args: 54 | s1 (str): Sequence 1. 55 | s2 (str): Sequence 2. 56 | 57 | Returns: 58 | float: Metric Longest Common Subsequence Distance. 59 | 60 | Examples: 61 | >>> rltk.metric_longest_common_subsequence('ABCDEFG', 'ABCDEFHJKL') 62 | 0.4 63 | # LCS: ABCDEF => length = 6 64 | # longest = s2 => length = 10 65 | # => 1 - 6/10 = 0.4 66 | 67 | >>> rltk.metric_longest_common_subsequence('ABDEF', 'ABDIF') 68 | 0.19999999999999996 69 | # LCS: ABDF => length = 4 70 | # longest = ABDEF => length = 5 71 | # => 1 - 4 / 5 = 0.2 72 | """ 73 | utils.check_for_none(s1, s2) 74 | utils.check_for_type(str, s1, s2) 75 | 76 | lcs = _lcs(s1, s2) 77 | return 1 - float(lcs) / max(len(s1), len(s2), 1) 78 | -------------------------------------------------------------------------------- /rltk/similarity/metaphone.py: -------------------------------------------------------------------------------- 1 | import rltk.utils as utils 2 | 3 | 4 | def metaphone(s): 5 | """ 6 | Metaphone fundamentally improves on the Soundex algorithm by using information about variations and inconsistencies in English spelling and pronunciation to produce a more accurate encoding, which does a better job of matching words and names which sound similar. As with Soundex, similar-sounding words should share the same keys. Metaphone is available as a built-in operator in a number of systems. 7 | 8 | Args: 9 | s (str): Sequence. 10 | 11 | Returns: 12 | str: Coded sequence. 13 | 14 | Examples: 15 | >>> rltk.metaphone('ashcraft') 16 | 'AXKRFT' 17 | >>> rltk.metaphone('pineapple') 18 | 'PNPL' 19 | """ 20 | # code from https://github.com/jamesturk/jellyfish 21 | # Copyright (c) 2015, James Turk 22 | # Copyright (c) 2015, Sunlight Foundation 23 | # All rights reserved.
24 | 25 | utils.check_for_none(s) 26 | utils.check_for_type(str, s) 27 | 28 | s = utils.unicode_normalize(s) 29 | 30 | if len(s) == 0: 31 | raise ValueError('Empty string') 32 | 33 | s = s.lower() 34 | result = [] 35 | 36 | # skip first character if s starts with these 37 | if s.startswith(('kn', 'gn', 'pn', 'ac', 'wr', 'ae')): 38 | s = s[1:] 39 | 40 | i = 0 41 | 42 | while i < len(s): 43 | c = s[i] 44 | next_ = s[i+1] if i < len(s)-1 else '*****' 45 | nextnext = s[i+2] if i < len(s)-2 else '*****' 46 | 47 | # skip doubles except for cc 48 | if c == next_ and c != 'c': 49 | i += 1 50 | continue 51 | 52 | if c in 'aeiou': 53 | if i == 0 or s[i-1] == ' ': 54 | result.append(c) 55 | elif c == 'b': 56 | if (not (i != 0 and s[i-1] == 'm')) or next_: 57 | result.append('b') 58 | elif c == 'c': 59 | if next_ == 'i' and nextnext == 'a' or next_ == 'h': 60 | result.append('x') 61 | i += 1 62 | elif next_ in 'iey': 63 | result.append('s') 64 | i += 1 65 | else: 66 | result.append('k') 67 | elif c == 'd': 68 | if next_ == 'g' and nextnext in 'iey': 69 | result.append('j') 70 | i += 2 71 | else: 72 | result.append('t') 73 | elif c in 'fjlmnr': 74 | result.append(c) 75 | elif c == 'g': 76 | if next_ in 'iey': 77 | result.append('j') 78 | elif next_ not in 'hn': 79 | result.append('k') 80 | elif next_ == 'h' and nextnext and nextnext not in 'aeiou': 81 | i += 1 82 | elif c == 'h': 83 | if i == 0 or next_ in 'aeiou' or s[i-1] not in 'aeiou': 84 | result.append('h') 85 | elif c == 'k': 86 | if i == 0 or s[i-1] != 'c': 87 | result.append('k') 88 | elif c == 'p': 89 | if next_ == 'h': 90 | result.append('f') 91 | i += 1 92 | else: 93 | result.append('p') 94 | elif c == 'q': 95 | result.append('k') 96 | elif c == 's': 97 | if next_ == 'h': 98 | result.append('x') 99 | i += 1 100 | elif next_ == 'i' and nextnext in 'oa': 101 | result.append('x') 102 | i += 2 103 | else: 104 | result.append('s') 105 | elif c == 't': 106 | if next_ == 'i' and nextnext in 'oa': 107 | result.append('x') 108 | elif next_ == 'h': 109 | result.append('0') 110 | i += 1 111 | elif next_ != 'c' or nextnext != 'h': 112 | result.append('t') 113 | elif c == 'v': 114 | result.append('f') 115 | elif c == 'w': 116 | if i == 0 and next_ == 'h': 117 | i += 1 118 | if nextnext in 'aeiou' or nextnext == '*****': 119 | result.append('w') 120 | elif c == 'x': 121 | if i == 0: 122 | if next_ == 'h' or (next_ == 'i' and nextnext in 'oa'): 123 | result.append('x') 124 | else: 125 | result.append('s') 126 | else: 127 | result.append('k') 128 | result.append('s') 129 | elif c == 'y': 130 | if next_ in 'aeiou': 131 | result.append('y') 132 | elif c == 'z': 133 | result.append('s') 134 | elif c == ' ': 135 | if len(result) > 0 and result[-1] != ' ': 136 | result.append(' ') 137 | 138 | i += 1 139 | 140 | return ''.join(result).upper() 141 | 142 | 143 | def metaphone_similarity(s1, s2): 144 | """ 145 | metaphone(s1) == metaphone(s2) 146 | 147 | Args: 148 | s1 (str): Sequence. 149 | s2 (str): Sequence. 150 | 151 | Returns: 152 | float: if metaphone(s1) equals to metaphone(s2) 153 | """ 154 | return 1 if metaphone(s1) == metaphone(s2) else 0 155 | -------------------------------------------------------------------------------- /rltk/similarity/needleman.py: -------------------------------------------------------------------------------- 1 | import rltk.utils as utils 2 | 3 | 4 | def _get_score(c1, c2, match, mismatch, score_table): 5 | """ 6 | if there's no score found in score_table, match & mismatch will be used. 
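    A short self-check of the rule above (values follow directly from the arguments):

        >>> _get_score('a', 'b', 2, -1, {'a': {'b': 3}})
        3
        >>> _get_score('a', 'a', 2, -1, None)
        2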
7 | """ 8 | if score_table and c1 in score_table and c2 in score_table[c1]: 9 | return score_table[c1][c2] 10 | else: 11 | return match if c1 == c2 else mismatch 12 | 13 | 14 | def needleman_wunsch_score(s1, s2, match=2, mismatch=-1, gap=-0.5, score_table=None): 15 | """ 16 | Neeldman Wunsch score 17 | """ 18 | utils.check_for_none(s1, s2) 19 | utils.check_for_type(str, s1, s2) 20 | 21 | score_table = score_table if isinstance(score_table, dict) else {} 22 | 23 | # s1 = utils.unicode_normalize(s1) 24 | # s2 = utils.unicode_normalize(s2) 25 | 26 | n1, n2 = len(s1), len(s2) 27 | if n1 == 0 and n2 == 0: 28 | return 0 29 | 30 | # construct matrix to get max score of all possible alignments 31 | dp = [[0] * (n2 + 1) for _ in range(n1 + 1)] 32 | for i in range(n1 + 1): 33 | for j in range(n2 + 1): 34 | if i == 0 and j == 0: # [0,0] 35 | continue 36 | elif i == 0: # most top row 37 | dp[i][j] = gap + dp[i][j - 1] 38 | elif j == 0: # most left column 39 | dp[i][j] = gap + dp[i - 1][j] 40 | else: 41 | dp[i][j] = max(dp[i][j - 1] + gap, 42 | dp[i - 1][j] + gap, 43 | dp[i - 1][j - 1] + _get_score(s1[i - 1], s2[j - 1], match, mismatch, score_table)) 44 | 45 | return dp[n1][n2] 46 | 47 | 48 | def needleman_wunsch_similarity(s1, s2, match=2, mismatch=-1, gap=-0.5, score_table=None): 49 | """ 50 | This Needleman Wunsch Similarity is computed as needlman_wunsch_score over maximum score of s1 and s2. 51 | 52 | Args: 53 | s1 (str): Sequence 1. 54 | s2 (str): Sequence 2. 55 | match (int, optional): Score of match. 56 | mismatch (int, optional): Score of mismatch. 57 | gap (int, optional): Gap penalty. 58 | score_dict (dict): Alignment score matrix. Default to None. 59 | 60 | Returns: 61 | float: Needleman Wunsch Similarity. 62 | """ 63 | 64 | nm = needleman_wunsch_score(s1, s2, match, mismatch, gap, score_table) 65 | 66 | # score_table = {'a': {'c': 3}, 'e': {'f': 9, 'k': 1}} 67 | score_s1 = sum([_get_score(c1, c1, match, mismatch, score_table) for c1 in s1]) 68 | score_s2 = sum([_get_score(c2, c2, match, mismatch, score_table) for c2 in s2]) 69 | 70 | max_score = max(score_s1, score_s2) 71 | 72 | if max_score < nm: 73 | raise ValueError('Illegal value of score_table') 74 | 75 | return float(nm) / max_score 76 | -------------------------------------------------------------------------------- /rltk/similarity/ngram.py: -------------------------------------------------------------------------------- 1 | import rltk.utils as utils 2 | 3 | 4 | def ngram_distance(s0, s1, n=2): 5 | """ 6 | N-Gram Distance as defined by Kondrak, "N-Gram Similarity and Distance" String Processing and Information Retrieval, Lecture Notes in Computer Science Volume 3772, 2005, pp 115-126. 7 | 8 | Args: 9 | s1 (str): Sequence 1. 10 | s2 (str): Sequence 2. 11 | 12 | Returns: 13 | float: NGram Distance. 
14 | 15 | Examples: 16 | >>> rltk.ngram_distance('ABCD', 'ABTUIO') 17 | 0.5833 18 | """ 19 | 20 | utils.check_for_none(s0, s1) 21 | utils.check_for_type(str, s0, s1) 22 | 23 | n1, n2 = len(s0), len(s1) 24 | special = "\n" 25 | 26 | if (n1 == 0 or n2 == 0): 27 | return 1 28 | 29 | if (s0 == s1): 30 | return 0 31 | 32 | cost = 0 33 | if (n1 < n or n2 < n): 34 | return 1 35 | 36 | # Adding special chars (n-1) to s0 37 | sa = special * (n - 1) + s0 38 | 39 | s2_j = [None] * n # jth n-gram of s2 40 | d = [0] * (n1 + 1) # cost array, horizontally 41 | p = [0] * (n1 + 1) # 'previous' cost array, horizontally 42 | 43 | for i in range(n1 + 1): 44 | p[i] = i 45 | 46 | for j in range(1, n2 + 1): 47 | # Construct s2_j n-gram 48 | if (j < n): 49 | for ti in range(n - j): 50 | s2_j[ti] = special 51 | 52 | for ti in range(n - j, n): 53 | s2_j[ti] = s1[ti - (n - j)] 54 | 55 | else: 56 | s2_j = list(s1[j - n: j]) 57 | 58 | d[0] = j 59 | 60 | for i in range(1, n1 + 1): 61 | cost = 0 62 | tn = n 63 | # Compare sa to s2_j 64 | for ni in range(n): 65 | if sa[i - 1 + ni] != s2_j[ni]: 66 | cost += 1 67 | elif sa[i - 1 + ni] == special: 68 | tn -= 1 69 | 70 | ec = float(cost) / tn 71 | # minimum of cell to the left+1, to the top+1, 72 | # diagonally left and up +cost 73 | d[i] = min(d[i - 1] + 1, p[i] + 1, p[i - 1] + ec) 74 | 75 | d2 = p 76 | p = d 77 | d = d2 78 | return float(p[n1]) / max(n2, n1) 79 | 80 | 81 | def ngram_similarity(s0, s1, n=2): 82 | """ 83 | N-Gram Similarity as defined by Kondrak, "N-Gram Similarity and Distance", String Processing and Information Retrieval, Lecture Notes in Computer Science Volume 3772, 2005, pp 115-126. 84 | 85 | Args: 86 | s0 (str): Sequence 1. 87 | s1 (str): Sequence 2. 88 | 89 | Returns: 90 | float: NGram Similarity. 91 | 92 | Examples: 93 | >>> rltk.ngram_similarity('ABCD', 'ABTUIO') 94 | 0.4166666666666667 95 | """ 96 | 97 | utils.check_for_none(s0, s1) 98 | utils.check_for_type(str, s0, s1) 99 | 100 | n1, n2 = len(s0), len(s1) 101 | special = "\n" 102 | 103 | if (n1 == 0 or n2 == 0): 104 | return 0 105 | 106 | if (s0 == s1): 107 | return 1 108 | 109 | cost = 0 110 | if (n1 < n or n2 < n): 111 | return 0 112 | 113 | # Adding special chars (n-1) to s0 114 | sa = special * (n - 1) + s0 115 | 116 | s2_j = [None] * n # jth n-gram of s2 117 | d = [0] * (n1 + 1) # cost array, horizontally 118 | p = [0] * (n1 + 1) # 'previous' cost array, horizontally 119 | 120 | for i in range(n1 + 1): 121 | p[i] = 0 122 | 123 | for j in range(1, n2 + 1): 124 | # Construct s2_j n-gram 125 | if (j < n): 126 | for ti in range(n - j): 127 | s2_j[ti] = special 128 | 129 | for ti in range(n - j, n): 130 | s2_j[ti] = s1[ti - (n - j)] 131 | 132 | else: 133 | s2_j = list(s1[j - n: j]) 134 | 135 | d[0] = 0 136 | 137 | for i in range(1, n1 + 1): 138 | cost = 0 139 | tn = n 140 | # Compare sa to s2_j 141 | for ni in range(n): 142 | if sa[i - 1 + ni] == s2_j[ni] and sa[i - 1 + ni] != special: 143 | cost += 1 144 | elif sa[i - 1 + ni] == special: 145 | tn -= 1 146 | 147 | ec = float(cost) / tn 148 | # maximum of cell to the left, to the top, 149 | # diagonally left and up + match score 150 | d[i] = max(d[i - 1], p[i], p[i - 1] + ec) 151 | 152 | d2 = p 153 | p = d 154 | d = d2 155 | return float(p[n1]) / max(n2, n1) 156 | -------------------------------------------------------------------------------- /rltk/similarity/nysiis.py: -------------------------------------------------------------------------------- 1 | import rltk.utils as utils 2 | 3 | 4 | def nysiis(s): 5 | """ 6 | New York State Immunization
Information System (NYSIIS) Phonetic Code is a phonetic algorithm created by `The New York State Department of Health's (NYSDOH) Bureau of Immunization 7 | `_. 8 | 9 | Args: 10 | s (str): Sequence. 11 | 12 | Returns: 13 | str: NYSIIS coded sequence. 14 | 15 | Examples: 16 | >>> rltk.nysiis_similarity('ashcraft', 'pineapple') 17 | 0 18 | 19 | """ 20 | # code from https://github.com/jamesturk/jellyfish 21 | # Copyright (c) 2015, James Turk 22 | # Copyright (c) 2015, Sunlight Foundation 23 | # All rights reserved. 24 | 25 | utils.check_for_none(s) 26 | utils.check_for_type(str, s) 27 | 28 | s = utils.unicode_normalize(s) 29 | 30 | if len(s) == 0: 31 | raise ValueError('Empty string') 32 | 33 | s = s.upper() 34 | key = [] 35 | 36 | # step 1 - prefixes 37 | if s.startswith('MAC'): 38 | s = 'MCC' + s[3:] 39 | elif s.startswith('KN'): 40 | s = s[1:] 41 | elif s.startswith('K'): 42 | s = 'C' + s[1:] 43 | elif s.startswith(('PH', 'PF')): 44 | s = 'FF' + s[2:] 45 | elif s.startswith('SCH'): 46 | s = 'SSS' + s[3:] 47 | 48 | # step 2 - suffixes 49 | if s.endswith(('IE', 'EE')): 50 | s = s[:-2] + 'Y' 51 | elif s.endswith(('DT', 'RT', 'RD', 'NT', 'ND')): 52 | s = s[:-2] + 'D' 53 | 54 | # step 3 - first character of key comes from name 55 | key.append(s[0]) 56 | 57 | # step 4 - translate remaining chars 58 | i = 1 59 | len_s = len(s) 60 | while i < len_s: 61 | ch = s[i] 62 | if ch == 'E' and i + 1 < len_s and s[i + 1] == 'V': 63 | ch = 'AF' 64 | i += 1 65 | elif ch in 'AEIOU': 66 | ch = 'A' 67 | elif ch == 'Q': 68 | ch = 'G' 69 | elif ch == 'Z': 70 | ch = 'S' 71 | elif ch == 'M': 72 | ch = 'N' 73 | elif ch == 'K': 74 | if i + 1 < len(s) and s[i + 1] == 'N': 75 | ch = 'N' 76 | else: 77 | ch = 'C' 78 | elif ch == 'S' and s[i + 1:i + 3] == 'CH': 79 | ch = 'SS' 80 | i += 2 81 | elif ch == 'P' and i + 1 < len(s) and s[i + 1] == 'H': 82 | ch = 'F' 83 | i += 1 84 | elif ch == 'H' and (s[i - 1] not in 'AEIOU' or (i + 1 < len(s) and s[i + 1] not in 'AEIOU')): 85 | if s[i - 1] in 'AEIOU': 86 | ch = 'A' 87 | else: 88 | ch = s[i - 1] 89 | elif ch == 'W' and s[i - 1] in 'AEIOU': 90 | ch = s[i - 1] 91 | 92 | if ch[-1] != key[-1][-1]: 93 | key.append(ch) 94 | 95 | i += 1 96 | 97 | key = ''.join(key) 98 | 99 | # step 5 - remove trailing S 100 | if key.endswith('S') and key != 'S': 101 | key = key[:-1] 102 | 103 | # step 6 - replace AY w/ Y 104 | if key.endswith('AY'): 105 | key = key[:-2] + 'Y' 106 | 107 | # step 7 - remove trailing A 108 | if key.endswith('A') and key != 'A': 109 | key = key[:-1] 110 | 111 | # step 8 was already done 112 | 113 | return key 114 | 115 | 116 | def nysiis_similarity(s1, s2): 117 | """ 118 | nysiis(s1) == nysiis(s2) 119 | 120 | Args: 121 | s1 (str): Sequence. 122 | s2 (str): Sequence. 123 | 124 | Returns: 125 | float: 1 if nysiis(s1) equals nysiis(s2), 0 otherwise. 126 | """ 127 | return 1 if nysiis(s1) == nysiis(s2) else 0 128 | -------------------------------------------------------------------------------- /rltk/similarity/qgram.py: -------------------------------------------------------------------------------- 1 | import rltk.utils as utils 2 | 3 | 4 | def get_ngrams(s, n): 5 | all_ngrams = list() 6 | for i in range(len(s)): 7 | if i + n <= len(s): 8 | all_ngrams.append(s[i:i + n]) 9 | 10 | return set(all_ngrams) 11 | 12 | 13 | def qgram_distance(s0, s1, n=2): 14 | """ 15 | QGram Distance is the number of q-grams (n-grams) that appear in only one of the 2 strings 16 | 17 | Args: 18 | s0 (str): Sequence 1. 19 | s1 (str): Sequence 2.
20 | 21 | Returns: 22 | int: QGram Distance. 23 | 24 | Examples: 25 | >>> rltk.qgram_distance('abcde','abdcde') 26 | 3 27 | """ 28 | if n > max(len(s0), len(s1)): 29 | return 1 30 | 31 | s0_ngrams = get_ngrams(s0, n) 32 | s1_ngrams = get_ngrams(s1, n) 33 | all_ngrams = list(s0_ngrams | s1_ngrams) 34 | 35 | v0 = [1 if all_ngrams[i] in s0_ngrams else 0 for i in range(len(all_ngrams))] 36 | v1 = [1 if all_ngrams[i] in s1_ngrams else 0 for i in range(len(all_ngrams))] 37 | 38 | return sum([1 if v0[i] != v1[i] else 0 for i in range(len(v0))]) 39 | 40 | 41 | def qgram_similarity(s0, s1, n=2): 42 | """ 43 | QGram Similarity is the number of common q-grams (n-grams) between 2 strings 44 | 45 | Args: 46 | s0 (str): Sequence 1. 47 | s1 (str): Sequence 2. 48 | 49 | Returns: 50 | int: QGram Similarity. 51 | 52 | Examples: 53 | >>> rltk.qgram_similarity('abcde','abdcde') 54 | 3 55 | """ 56 | 57 | if n > max(len(s0), len(s1)): 58 | return 0 59 | 60 | s0_ngrams = get_ngrams(s0, n) 61 | s1_ngrams = get_ngrams(s1, n) 62 | all_ngrams = list(s0_ngrams | s1_ngrams) 63 | 64 | v0 = [1 if all_ngrams[i] in s0_ngrams else 0 for i in range(len(all_ngrams))] 65 | v1 = [1 if all_ngrams[i] in s1_ngrams else 0 for i in range(len(all_ngrams))] 66 | 67 | return sum([1 if v0[i] == v1[i] else 0 for i in range(len(v0))]) 68 | -------------------------------------------------------------------------------- /rltk/similarity/soundex.py: -------------------------------------------------------------------------------- 1 | import rltk.utils as utils 2 | 3 | 4 | def soundex(s): 5 | """ 6 | The standard used for this implementation is provided by `U.S. Census Bureau `_. 7 | 8 | Args: 9 | s (str): Sequence. 10 | 11 | Returns: 12 | str: Coded sequence. 13 | 14 | Examples: 15 | >>> rltk.soundex('ashcraft') 16 | 'A261' 17 | >>> rltk.soundex('pineapple') 18 | 'P514' 19 | """ 20 | 21 | utils.check_for_none(s) 22 | utils.check_for_type(str, s) 23 | 24 | s = utils.unicode_normalize(s) 25 | 26 | if len(s) == 0: 27 | raise ValueError('Empty string') 28 | 29 | s = s.upper() 30 | 31 | CODES = ( 32 | ('BFPV', '1'), 33 | ('CGJKQSXZ', '2'), 34 | ('DT', '3'), 35 | ('L', '4'), 36 | ('MN', '5'), 37 | ('R', '6'), 38 | ('AEIOUHWY', '.') # placeholder 39 | ) 40 | CODE_DICT = dict((c, replace) for chars, replace in CODES for c in chars) 41 | 42 | sdx = s[0] 43 | for i in range(1, len(s)): 44 | if s[i] not in CODE_DICT: 45 | continue 46 | 47 | code = CODE_DICT[s[i]] 48 | if code == '.': 49 | continue 50 | if s[i] == s[i - 1]: # ignore same letter 51 | continue 52 | if s[i - 1] in CODE_DICT and CODE_DICT[s[i - 1]] == code: # 'side-by-side' rule 53 | continue 54 | if s[i - 1] in ('H', 'W') and i - 2 >= 0 and \ 55 | s[i - 2] in CODE_DICT and CODE_DICT[s[i - 2]] == code: # same-coded consonants separated by 'H' or 'W' count once 56 | continue 57 | 58 | sdx += code 59 | 60 | sdx = sdx[0:4].ljust(4, '0') 61 | 62 | return sdx 63 | 64 | 65 | def soundex_similarity(s1, s2): 66 | """ 67 | soundex(s1) == soundex(s2) 68 | 69 | Args: 70 | s1 (str): Sequence. 71 | s2 (str): Sequence. 72 | 73 | Returns: 74 | float: 1 if soundex(s1) equals soundex(s2), 0 otherwise. 75 | """ 76 | return 1 if soundex(s1) == soundex(s2) else 0 77 | -------------------------------------------------------------------------------- /rltk/similarity/tf_idf.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import math 3 | 4 | import rltk.utils as utils 5 | 6 | 7 | def tf_idf_similarity(bag1, bag2, df_corpus, doc_size, math_log=False): 8 | """ 9 | Computes TF/IDF measure.
This measure employs the notion of TF/IDF score commonly used in information retrieval (IR) to find documents that are relevant to keyword queries. The intuition underlying the TF/IDF measure is that two strings are similar if they share distinguishing terms. 10 | 11 | Note: 12 | If you need to call this function many times, :meth:`TF_IDF` is more efficient. 13 | 14 | Args: 15 | bag1 (list): Bag 1. 16 | bag2 (list): Bag 2. 17 | df_corpus (dict): Pre-calculated document frequency of each term in the corpus. 18 | doc_size (int): Total number of documents in the corpus. 19 | math_log (bool, optional): Flag to indicate whether math.log() should be used in TF and IDF formulas. Defaults to False. 20 | 21 | Returns: 22 | float: TF/IDF cosine similarity. 23 | 24 | Examples: 25 | >>> rltk.tf_idf_similarity(['a', 'b', 'a'], ['a', 'c'], {'a':3, 'b':1, 'c':1}, 3) 26 | 0.17541160386140586 27 | >>> rltk.tf_idf_similarity(['a', 'b', 'a'], ['a', 'c'], {'a':3, 'b':2, 'c':1}, 4, True) 28 | 0.12977804138 29 | >>> rltk.tf_idf_similarity(['a', 'b', 'a'], ['a'], {'a':3, 'b':1, 'c':1}, 3) 30 | 0.5547001962252291 31 | """ 32 | # http://www.tfidf.com/ 33 | 34 | utils.check_for_none(bag1, bag2, df_corpus) 35 | utils.check_for_type(list, bag1, bag2) 36 | 37 | # term frequency for input strings 38 | t_x, t_y = collections.Counter(bag1), collections.Counter(bag2) 39 | tf_x = {k: float(v) / len(bag1) for k, v in t_x.items()} 40 | tf_y = {k: float(v) / len(bag2) for k, v in t_y.items()} 41 | 42 | # unique element 43 | total_unique_elements = set() 44 | total_unique_elements.update(bag1) 45 | total_unique_elements.update(bag2) 46 | 47 | idf_element, v_x, v_y, v_x_y, v_x_2, v_y_2 = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 48 | 49 | # tfidf calculation 50 | for element in total_unique_elements: 51 | if element not in df_corpus: 52 | continue 53 | idf_element = doc_size * 1.0 / df_corpus[element] 54 | 55 | v_x = 0 if element not in tf_x else (math.log(idf_element) * tf_x[element]) if math_log else ( 56 | idf_element * tf_x[element]) 57 | v_y = 0 if element not in tf_y else (math.log(idf_element) * tf_y[element]) if math_log else ( 58 | idf_element * tf_y[element]) 59 | v_x_y += v_x * v_y 60 | v_x_2 += v_x * v_x 61 | v_y_2 += v_y * v_y 62 | 63 | # cosine similarity 64 | return 0.0 if v_x_y == 0 else v_x_y / (math.sqrt(v_x_2) * math.sqrt(v_y_2)) 65 | 66 | 67 | def compute_tf(tokens): 68 | """ 69 | Compute TF (Term Frequency) 70 | 71 | Args: 72 | tokens (list): List of token strings. 73 | """ 74 | terms = collections.Counter(tokens) 75 | return {k: float(v) / len(tokens) for k, v in terms.items()} 76 | 77 | 78 | def compute_idf(df_corpus, doc_size, math_log=False): 79 | """ 80 | Compute IDF (Inverse Document Frequency) 81 | 82 | Args: 83 | df_corpus (dict): Document frequency of each term. 84 | doc_size (int): Total number of documents. 85 | math_log (bool, optional): Take the logarithm of the result if True. Defaults to False. 86 | """ 87 | return {k: doc_size * 1.0 / v if math_log is False \ 88 | else math.log(doc_size * 1.0 / v) \ 89 | for k, v in df_corpus.items()} 90 | 91 | 92 | def tf_idf_cosine_similarity(tfidf_dict1, tfidf_dict2): 93 | """ 94 | Compute Cosine similarity for TF/IDF value dictionaries 95 | 96 | Args: 97 | tfidf_dict1 (dict): TF/IDF dictionary for first record, format in ``{term1: tfidf value, ...}`` 98 | tfidf_dict2 (dict): TF/IDF dictionary for second record, same format as tfidf_dict1.
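    A tiny worked example with hypothetical TF/IDF values (two identical one-term vectors, then two vectors with no shared terms):

        >>> tf_idf_cosine_similarity({'a': 1.0}, {'a': 1.0})
        1.0
        >>> tf_idf_cosine_similarity({'a': 1.0}, {'b': 1.0})
        0.0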
99 | 100 | Returns: 101 | float: Cosine similarity. 102 | """ 103 | v_x_y, v_x_2, v_y_2 = 0.0, 0.0, 0.0 104 | 105 | # intersection of dict1 and dict2 106 | # ignore the values that are not in both 107 | for t in tfidf_dict1.keys(): 108 | if t in tfidf_dict2: 109 | v_x_y += tfidf_dict1[t] * tfidf_dict2[t] 110 | 111 | for t, tfidf in tfidf_dict1.items(): 112 | v_x_2 += tfidf * tfidf 113 | for t, tfidf in tfidf_dict2.items(): 114 | v_y_2 += tfidf * tfidf 115 | 116 | # cosine similarity 117 | return 0.0 if v_x_y == 0 else v_x_y / (math.sqrt(v_x_2) * math.sqrt(v_y_2)) 118 | 119 | 120 | class TF_IDF(): 121 | """ 122 | TF/IDF helper class (an efficient implementation) 123 | 124 | Examples:: 125 | 126 | # initialization 127 | tfidf = TF_IDF() 128 | # add document 129 | tfidf.add_document('id1', ['a', 'b', 'a']) 130 | tfidf.add_document('id2', ['b', 'c']) 131 | tfidf.add_document('id3', ['b', 'd']) 132 | # compute idf 133 | tfidf.pre_compute() 134 | # get similarity 135 | tfidf.similarity('id1', 'id2') 136 | tfidf.similarity('id1', 'id3') 137 | """ 138 | 139 | def __init__(self): 140 | self.tf = {} 141 | self.df_corpus = {} 142 | self.doc_size = 0 143 | self.idf = {} # filled in by pre_compute() 144 | 145 | def add_document(self, doc_id: str, tokens: list): 146 | """ 147 | Add document to corpus 148 | 149 | Args: 150 | doc_id (str): Document (record) id. 151 | tokens (list): List of token strings. 152 | """ 153 | self.doc_size += 1 154 | tf = compute_tf(tokens) 155 | self.tf[doc_id] = tf 156 | for k, _ in tf.items(): 157 | self.df_corpus[k] = self.df_corpus.get(k, 0) + 1 158 | 159 | def pre_compute(self, math_log: bool = False): 160 | """ 161 | Pre-compute IDF score 162 | 163 | Args: 164 | math_log (bool, optional): Flag to indicate whether math.log() should be used in the IDF formula. Defaults to False.
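        As a sketch, with the corpus built in the class docstring above (so df_corpus = {'a': 1, 'b': 3, 'c': 1, 'd': 1} and doc_size = 3):

            >>> tfidf.pre_compute()      # idf['b'] = 3 / 3 = 1.0
            >>> tfidf.pre_compute(True)  # idf['b'] = log(3 / 3) = 0.0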
165 | """ 166 | self.idf = compute_idf(self.df_corpus, self.doc_size, math_log) 167 | 168 | def similarity(self, id1, id2): 169 | """ 170 | Get similarity 171 | 172 | Args: 173 | id1 (str): Id of document 1. 174 | id2 (str): Id of document 2. 175 | 176 | Returns: 177 | float: TF/IDF cosine similarity. 178 | """ 179 | tf_x = self.tf[id1] 180 | tfidf_x = {k: v * self.idf[k] for k, v in tf_x.items()} 181 | tf_y = self.tf[id2] 182 | tfidf_y = {k: v * self.idf[k] for k, v in tf_y.items()} 183 | return tf_idf_cosine_similarity(tfidf_x, tfidf_y) 184 | -------------------------------------------------------------------------------- /rltk/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/usc-isi-i2/rltk/aee10ed5dd561583e60db3373ed82fe1208da1e9/rltk/tests/__init__.py -------------------------------------------------------------------------------- /rltk/tests/test_blocking.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import random 3 | 4 | from rltk.record import Record 5 | from rltk.dataset import Dataset 6 | from rltk.io.reader.array_reader import ArrayReader 7 | from rltk.blocking.block_black_list import BlockBlackList 8 | from rltk.blocking.hash_block_generator import HashBlockGenerator 9 | from rltk.blocking.token_block_generator import TokenBlockGenerator 10 | from rltk.blocking.canopy_block_generator import CanopyBlockGenerator 11 | from rltk.blocking.sorted_neighbourhood_block_generator import SortedNeighbourhoodBlockGenerator 12 | 13 | 14 | class ConcreteRecord(Record): 15 | @property 16 | def id(self): 17 | return self.raw_object['id'] 18 | 19 | @property 20 | def name(self): 21 | return self.raw_object['name'] 22 | 23 | @property 24 | def category(self): 25 | return self.raw_object['category'] 26 | 27 | 28 | raw_data = [ 29 | {'id': '1', 'name': 'apple', 'category': 'a'}, 30 | {'id': '2', 'name': 'banana', 'category': 'a'}, 31 | {'id': '3', 'name': 'apple & banana', 'category': 'b'}, 32 | {'id': '4', 'name': 'pineapple', 'category': 'b'}, 33 | {'id': '5', 'name': 'peach', 'category': 'b'}, 34 | {'id': '6', 'name': 'coconut', 'category': 'b'} 35 | ] 36 | 37 | ds = Dataset(reader=ArrayReader(raw_data), record_class=ConcreteRecord) 38 | 39 | 40 | def test_hash_block_generator(): 41 | bg = HashBlockGenerator() 42 | block = bg.block(ds, property_='category') 43 | for key, set_ in block.key_set_adapter: 44 | if key == 'a': 45 | assert set_ == set([(ds.id, '1'), (ds.id, '2')]) 46 | elif key == 'b': 47 | assert set_ == set([(ds.id, '3'), (ds.id, '4'), (ds.id, '5'), (ds.id, '6')]) 48 | block = bg.block(ds, function_=lambda r: r.category) 49 | for key, set_ in block.key_set_adapter: 50 | if key == 'a': 51 | assert set_ == set([(ds.id, '1'), (ds.id, '2')]) 52 | elif key == 'b': 53 | assert set_ == set([(ds.id, '3'), (ds.id, '4'), (ds.id, '5'), (ds.id, '6')]) 54 | 55 | block_black_list = BlockBlackList(max_size=2) 56 | block = bg.block(ds, property_='category', block_black_list=block_black_list) 57 | for key, set_ in block.key_set_adapter: 58 | assert key == 'a' 59 | for key, _ in block_black_list.key_set_adapter: 60 | assert key == 'b' 61 | 62 | 63 | def test_token_block_generator(): 64 | bg = TokenBlockGenerator() 65 | block = bg.block(ds, function_=lambda r: r.name.split(' ')) 66 | for key, set_ in block.key_set_adapter: 67 | if key == 'apple': 68 | assert set_ == set([(ds.id, '1'), (ds.id, '3')]) 69 | elif key == 'banana': 70 | assert set_ == set([(ds.id, '2'), (ds.id, '3')]) 71 | 72 | block_black_list =
BlockBlackList(max_size=1) 73 | block = bg.block(ds, function_=lambda r: r.name.split(' '), block_black_list=block_black_list) 74 | for key, set_ in block.key_set_adapter: 75 | assert len(set_) <= 1 76 | for key, _ in block_black_list.key_set_adapter: 77 | assert key in ('apple', 'banana') 78 | 79 | 80 | def test_canopy_block_generator(): 81 | random.seed(0) 82 | bg = CanopyBlockGenerator(t1=5, t2=1, distance_metric=lambda x, y: abs(x[0] - y[0])) 83 | block = bg.block(ds, function_=lambda r: [ord(r.name[0].lower()) - 0x61]) 84 | output_block = bg.generate(block, block) 85 | result = [ 86 | ['4', '5'], 87 | ['1', '2', '3', '6'], 88 | ['2', '6'], 89 | ['6'] 90 | ] 91 | for k, v in output_block.key_set_adapter: 92 | ids = [r[1] for r in v] 93 | assert sorted(ids) == sorted(result[k]) 94 | 95 | def test_sorted_neighbourhood_block_generator(): 96 | class SNConcreteRecord1(Record): 97 | @property 98 | def id(self): 99 | return self.raw_object['id'] 100 | 101 | @property 102 | def char(self): 103 | return self.raw_object['char'] 104 | 105 | class SNConcreteRecord2(SNConcreteRecord1): 106 | pass 107 | 108 | sn_raw_data_1 = [ 109 | {'id': '11', 'char': 'a'}, 110 | {'id': '12', 'char': 'd'}, 111 | {'id': '13', 'char': 'c'}, 112 | {'id': '14', 'char': 'e'}, 113 | ] 114 | 115 | sn_raw_data_2 = [ 116 | {'id': '21', 'char': 'b'}, 117 | {'id': '22', 'char': 'a'}, 118 | {'id': '23', 'char': 'e'}, 119 | {'id': '24', 'char': 'f'}, 120 | ] 121 | 122 | ds1 = Dataset(reader=ArrayReader(sn_raw_data_1), record_class=SNConcreteRecord1) 123 | ds2 = Dataset(reader=ArrayReader(sn_raw_data_2), record_class=SNConcreteRecord2) 124 | 125 | bg = SortedNeighbourhoodBlockGenerator(window_size=3) 126 | block = bg.generate( 127 | bg.block(ds1, property_='char'), 128 | bg.block(ds2, property_='char') 129 | ) 130 | 131 | for block_id, set_ in block.key_set_adapter: 132 | block_data = [] 133 | for did, rid in set_: 134 | if did == ds1.id: 135 | block_data.append(ds1.get_record(rid).char) 136 | else: 137 | block_data.append(ds2.get_record(rid).char) 138 | block_data.sort() 139 | for i in range(len(block_data) - 1): 140 | assert block_data[i] <= block_data[i+1] # each char should be less than or equal to the next one 141 | -------------------------------------------------------------------------------- /rltk/tests/test_io_adapter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pytest 3 | import redis 4 | import tempfile 5 | import shutil 6 | 7 | from rltk.record import Record 8 | from rltk.io.adapter import * 9 | 10 | 11 | class ConcreteRecord(Record): 12 | 13 | @property 14 | def id(self): 15 | return self.raw_object['id'] 16 | 17 | @property 18 | def value(self): 19 | return self.raw_object['value'] 20 | 21 | 22 | record = ConcreteRecord(raw_object={'id': 'id1', 'value': 'value1'}) 23 | 24 | 25 | def _test_key_value_adapter(adapter): 26 | adapter.set(record.id, record) 27 | assert adapter.get(record.id).id == record.id 28 | assert adapter.get(record.id).value == record.value 29 | for rid, r in adapter: 30 | assert type(rid) == str 31 | assert rid == record.id 32 | assert r.id == record.id 33 | break 34 | 35 | assert adapter.get('no_such_key') is None 36 | adapter.clean() 37 | 38 | 39 | def test_memory_key_value_adapter(): 40 | adapter = MemoryKeyValueAdapter() 41 | _test_key_value_adapter(adapter) 42 | 43 | 44 | def test_dbm_key_value_adapter(): 45 | name = 'test_dbm_adapter' 46 | adapter = DbmKeyValueAdapter(name) 47 | _test_key_value_adapter(adapter) 48 | if
os.path.exists(name + '.db'): 49 | os.remove(name + '.db') 50 | 51 | 52 | def test_redis_key_value_adapter(): 53 | try: 54 | adapter = RedisKeyValueAdapter('127.0.0.1', key_prefix='rltk_test_redis_key_value_adapter_') 55 | _test_key_value_adapter(adapter) 56 | except redis.exceptions.ConnectionError: 57 | return 58 | 59 | 60 | def _test_key_set_adapter(adapter): 61 | adapter.set('a', set(['1', '2', '3'])) 62 | assert adapter.get('a') == set(['1', '2', '3']) 63 | adapter.add('a', '4') 64 | assert adapter.get('a') == set(['1', '2', '3', '4']) 65 | adapter.remove('a', '4') 66 | assert adapter.get('a') == set(['1', '2', '3']) 67 | assert adapter.get('b') is None 68 | for k, v in adapter: 69 | assert type(k) == str 70 | assert k == 'a' 71 | assert v == set(['1', '2', '3']) 72 | break 73 | adapter.delete('a') 74 | assert adapter.get('a') is None 75 | adapter.set('c', set(['1', '2', '3'])) 76 | adapter.clean() 77 | assert adapter.get('c') is None 78 | 79 | 80 | def test_memory_key_set_adapter(): 81 | adapter = MemoryKeySetAdapter() 82 | _test_key_set_adapter(adapter) 83 | 84 | 85 | def test_leveldb_key_set_adapter(): 86 | path = os.path.join(tempfile.gettempdir(), 'rltk_test_leveldb_key_set_adapter') 87 | adapter = LevelDbKeySetAdapter(path, name='test') 88 | _test_key_set_adapter(adapter) 89 | 90 | shutil.rmtree(path) 91 | 92 | 93 | def test_redis_key_set_adapter(): 94 | try: 95 | adapter = RedisKeySetAdapter('127.0.0.1', key_prefix='rltk_test_redis_key_set_adapter_') 96 | _test_key_set_adapter(adapter) 97 | except redis.exceptions.ConnectionError: 98 | return 99 | -------------------------------------------------------------------------------- /rltk/tests/test_io_reader.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import csv 3 | import json 4 | import io 5 | 6 | from rltk.io.reader import * 7 | 8 | 9 | arr = [{'1': 'A', '2': 'B'}, {'1': 'a', '2': 'b'}] 10 | 11 | 12 | def test_array_reader(): 13 | for idx, obj in enumerate(ArrayReader(arr)): 14 | assert obj == arr[idx] 15 | 16 | 17 | def test_dataframe_reader(): 18 | df = pd.DataFrame(arr) 19 | for idx, obj in enumerate(DataFrameReader(df)): 20 | assert obj == arr[idx] 21 | 22 | 23 | def test_dataframe_reader_keep_index(): 24 | df = pd.DataFrame(arr) 25 | for idx, obj in enumerate(DataFrameReader(df, True)): 26 | assert obj == dict(**arr[idx], dataframe_default_index=idx) 27 | 28 | 29 | def test_csv_reader(): 30 | f = io.StringIO() 31 | 32 | writer = csv.DictWriter(f, fieldnames=['1', '2']) 33 | writer.writeheader() 34 | for a in arr: 35 | writer.writerow(a) 36 | 37 | for idx, obj in enumerate(CSVReader(f)): 38 | assert obj == arr[idx] 39 | 40 | f.close() 41 | 42 | 43 | def test_jsonlines_reader(): 44 | f = io.StringIO() 45 | 46 | for a in arr: 47 | f.write(json.dumps(a) + '\n') 48 | 49 | for idx, obj in enumerate(JsonLinesReader(f)): 50 | assert obj == arr[idx] 51 | 52 | f.close() -------------------------------------------------------------------------------- /rltk/tests/test_trial.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from rltk.record import Record 4 | from rltk.evaluation.ground_truth import GroundTruth 5 | from rltk.evaluation.trial import Trial 6 | from rltk.similarity import * 7 | 8 | 9 | class ConcreteRecord(Record): 10 | @property 11 | def id(self): 12 | return self.raw_object['id'] 13 | 14 | @property 15 | def data(self): 16 | return self.raw_object['data'] 17 | 18 | 19 | 
@pytest.mark.parametrize('ground_truth_list, cal_result_list, min_c, top_k,tp,tn,fp,fn', [ 20 | ([(1, 1, 10, 0, True), (2, 2, 11, 0, True), (3, 1, 12, 1, False), (4, 2, 13, 1, False)], 21 | [(1, 1, 10, 0, True, 0.5), (2, 2, 11, 0, False, 0.5), (3, 1, 12, 1, True, 0.5), (4, 2, 13, 1, False, 0.5)], 0, 0, 22 | 0.5, 0.5, 0.5, 23 | 0.5), 24 | # ([(1, 0, True), (2, 0, True), (1, 1, False), (2, 1, False)], 25 | # [(1, 0, True, 0.6), (2, 0, False, 0.5), (1, 1, True, 0.5), (2, 1, False, 0.6)], 0, 2, 1.0, 1.0, 0, 0) 26 | ]) 27 | def test_basic(ground_truth_list, cal_result_list, min_c, top_k, tp, tn, fp, fn): 28 | # if not isinstance(ground_truth_list, (list)) or not isinstance(cal_result_list, (list)): 29 | # with pytest.raises(ValueError): 30 | # # number_equal(n1, n2) 31 | # else: 32 | do_test_trial(ground_truth_list, cal_result_list, min_c, top_k, tp, tn, fp, fn) 33 | 34 | 35 | def do_test_trial(ground_truth_list, cal_result_list, min_c, top_k, tp, tn, fp, fn): 36 | gt = GroundTruth() 37 | 38 | for r1_id, r1_d, r2_id, r2_d, p in ground_truth_list: 39 | raw_object = {'id': r1_id, 'data': r1_d} 40 | r1 = ConcreteRecord(raw_object) 41 | raw_object = {'id': r2_id, 'data': r2_d} 42 | r2 = ConcreteRecord(raw_object) 43 | gt.add_ground_truth(r1_id, r2_id, p) 44 | 45 | trial = Trial(gt, min_c, top_k) 46 | for r1_id, r1_d, r2_id, r2_d, p, c in cal_result_list: 47 | raw_object = {'id': r1_id, 'data': r1_d} 48 | r1 = ConcreteRecord(raw_object) 49 | raw_object = {'id': r2_id, 'data': r2_d} 50 | r2 = ConcreteRecord(raw_object) 51 | trial.add_result(r1, r2, p, c) 52 | 53 | trial.evaluate() 54 | 55 | assert trial.true_positives == tp 56 | assert trial.true_negatives == tn 57 | assert trial.false_positives == fp 58 | assert trial.false_negatives == fn 59 | 60 | 61 | @pytest.mark.parametrize('ground_truth_list, min_c, top_k, similarity_info, tp, tn, fp, fn', [ 62 | ([('0', '', '10', 'abc', False), ('1', 'abc', '11', 'abc', True), ('2', 'abcd', '12', 'abc', False), 63 | ('3', 'abd', '13', 'abc', False)], 64 | 0, 0, [('levenshtein_similarity', 0.9), ('string_equal', 0.5)], 1.0, 1.0, 0, 0), 65 | ([('0', '', '10', 'abc', False), ('1', 'abc', '11', 'abc', True), ('2', 'abcd', '12', 'abc', False), 66 | ('3', 'abd', '13', 'abc', False)], 67 | 0, 2, [('levenshtein_similarity', 0.9), ('string_equal', 0.5)], 1.0, 1.0, 0, 0) 68 | ]) 69 | def test_lvl(ground_truth_list, min_c, top_k, similarity_info, tp, tn, fp, fn): 70 | gt = GroundTruth() 71 | 72 | for r1_id, r1_d, r2_id, r2_d, p in ground_truth_list: 73 | raw_object = {'id': r1_id, 'data': r1_d} 74 | r1 = ConcreteRecord(raw_object) 75 | raw_object = {'id': r2_id, 'data': r2_d} 76 | r2 = ConcreteRecord(raw_object) 77 | gt.add_ground_truth(r1_id, r2_id, p) 78 | 79 | for similarity_function, min_confidence in similarity_info: 80 | trial = Trial(gt, min_confidence=min_c, top_k=top_k) 81 | 82 | i = 0 83 | for r1_id, r1_d, r2_id, r2_d, c in ground_truth_list: 84 | raw_object = {'id': r1_id, 'data': r1_d} 85 | r1 = ConcreteRecord(raw_object) 86 | raw_object = {'id': r2_id, 'data': r2_d} 87 | r2 = ConcreteRecord(raw_object) 88 | 89 | func_info = similarity_function + '("' + r1_d + '","' + r2_d + '")' 90 | c = eval(func_info) 91 | p = (c >= min_confidence) 92 | trial.add_result(r1, r2, p, c) 93 | 94 | trial.evaluate() 95 | 96 | assert trial.true_positives == tp 97 | assert trial.true_negatives == tn 98 | assert trial.false_positives == fp 99 | assert trial.false_negatives == fn 100 | -------------------------------------------------------------------------------- 
/rltk/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import List 3 | from rltk.tokenizer.crf_tokenizer import crf_tokenizer as dig_tokenizer 4 | 5 | 6 | class Tokenizer(ABC): 7 | """ 8 | Abstract tokenizer 9 | """ 10 | 11 | @abstractmethod 12 | def tokenize(self, s: str) -> List[str]: 13 | """ 14 | Apply tokenizer 15 | 16 | Args: 17 | s (str): String to tokenize. 18 | 19 | Returns: 20 | List[str]: Tokenized list. Tokens are not deduplicated. 21 | """ 22 | raise NotImplementedError 23 | 24 | 25 | class CRFTokenizer(Tokenizer): 26 | """ 27 | CRFTokenizer: wraps the legacy DIG CrfTokenizer 28 | """ 29 | 30 | def __init__(self, *args, **kwargs) -> None: 31 | self._t = dig_tokenizer.CrfTokenizer(*args, **kwargs) 32 | 33 | def tokenize(self, s: str) -> List[str]: 34 | return self._t.tokenize(s) 35 | 36 | 37 | class WordTokenizer(Tokenizer): 38 | """ 39 | Word Tokenizer: tokenizes a string by white space 40 | 41 | Args: 42 | remove_empty (bool, optional): If set, empty tokens will be removed. Defaults to False. 43 | """ 44 | 45 | def __init__(self, remove_empty: bool = False) -> None: 46 | self._remove_empty = remove_empty 47 | 48 | def tokenize(self, s: str) -> List[str]: 49 | s = s.split(' ') 50 | if self._remove_empty: 51 | return list(filter(lambda x: len(x) != 0, s)) 52 | else: 53 | return s 54 | 55 | 56 | class NGramTokenizer(Tokenizer): 57 | """ 58 | NGram Tokenizer 59 | 60 | Args: 61 | n (int): Size of each n-gram. 62 | place_holder (str, optional): String used for padding and as token separator. Defaults to white space (' '). 63 | padded (bool, optional): If set, the head and the tail will be padded. Defaults to False. 64 | """ 65 | 66 | def __init__(self, n: int, place_holder: str = ' ', padded: bool = False, 67 | base_tokenizer: Tokenizer = None) -> None: 68 | self._n = n 69 | self._place_holder = place_holder 70 | self._padded = padded 71 | self._base_tokenizer = base_tokenizer if base_tokenizer else WordTokenizer() 72 | 73 | def tokenize(self, s: str) -> List[str]: 74 | if len(s) == 0: 75 | return [] 76 | if self._padded: 77 | pad = self._place_holder * (self._n - 1) 78 | s = pad + s + pad 79 | s = self._base_tokenizer.tokenize(s) 80 | s = self._place_holder.join(s) 81 | if len(s) < self._n: 82 | return [s] 83 | return [s[i:i + self._n] for i in range(len(s) - self._n + 1)] 84 | -------------------------------------------------------------------------------- /rltk/tokenizer/crf_tokenizer/README.md: -------------------------------------------------------------------------------- 1 | # dig-crf-tokenizer 2 | 3 | The tokenization rules take into account embedded HTML tags and 4 | entities. HTML tags begin with "<" and end with ">". The contents of a 5 | tag are treated as a single token, although internal spaces, tabs, and 6 | newlines are stripped out so as not to confuse CRF++. HTML entities 7 | begin with "&" and end with ";", with certain characters allowed 8 | in between. They are treated as single tokens. 9 | 10 | HTML tags and HTML entities optionally can be skipped (omitted from the 11 | output array of tokens) after recognition. 12 | 13 | There are risks to the HTML processing rules when the text being 14 | tokenized is not proper HTML. Left angle brackets can cause the 15 | following text to become a single token. Ampersands can merge into 16 | the following textual word. 17 | 18 | A possible solution to the bare ampersand problem is to recognize only 19 | the defined set of HTML entities.
It is harder to think of a solution 20 | to the bare left angle bracket problem; perhaps check if they are 21 | followed by the beginning of a valid HTML tag name? 22 | 23 | There is also special provision to group contiguous punctuation characters. 24 | The way to use this tokenizer is to create an instance of it, set any 25 | processing flags you need, then call the tokenize(value) function, 26 | which will return the tokens in an array. 27 | 28 | To tokenize, breaking on punctuation without recognizing HTML tags and 29 | entities, try: 30 | ``` 31 | t = CrfTokenizer() 32 | tokens = t.tokenize(value) 33 | ``` 34 | 35 | To tokenize, breaking on punctuation and recognizing both HTML tags and 36 | entities as special tokens, try: 37 | ``` 38 | t = CrfTokenizer() 39 | t.setRecognizeHtmlEntities(True) 40 | t.setRecognizeHtmlTags(True) 41 | tokens = t.tokenize(value) 42 | ``` 43 | 44 | To tokenize, breaking on punctuation, recognizing HTML tags and 45 | entities, and skipping the tags, try: 46 | ``` 47 | t = CrfTokenizer() 48 | t.setRecognizeHtmlEntities(True) 49 | t.setRecognizeHtmlTags(True) 50 | t.setSkipHtmlTags(True) 51 | tokens = t.tokenize(value) 52 | ``` 53 | 54 | The following sequence will tokenize, strip HTML tags, then join the tokens 55 | into a string. The final result will be the input string with HTML entities 56 | treated as single tokens, HTML tags stripped out, punctuation separated from 57 | adjacent words, and excess white space removed. 58 | ``` 59 | t = CrfTokenizer() 60 | t.setRecognizeHtmlEntities(True) 61 | t.setRecognizeHtmlTags(True) 62 | t.setSkipHtmlTags(True) 63 | result = " ".join(t.tokenize(value)) 64 | ``` 65 | 66 | The same as above, but with punctuation remaining glued to adjacent words: 67 | ``` 68 | t = CrfTokenizer() 69 | t.setRecognizePunctuation(False) 70 | t.setRecognizeHtmlTags(True) 71 | t.setSkipHtmlTags(True) 72 | result = " ".join(t.tokenize(value)) 73 | ``` 74 | -------------------------------------------------------------------------------- /rltk/tokenizer/crf_tokenizer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/usc-isi-i2/rltk/aee10ed5dd561583e60db3373ed82fe1208da1e9/rltk/tokenizer/crf_tokenizer/__init__.py -------------------------------------------------------------------------------- /rltk/utils.py: -------------------------------------------------------------------------------- 1 | import unicodedata 2 | import warnings 3 | 4 | from typing import TYPE_CHECKING 5 | if TYPE_CHECKING: 6 | from rltk.dataset import Dataset 7 | from rltk.blocking.block import Block 8 | from rltk.evaluation.ground_truth import GroundTruth 9 | 10 | 11 | MAX_FLOAT = float('inf') 12 | MIN_FLOAT = float('-inf') 13 | 14 | 15 | def check_for_none(*args): 16 | for arg in args: 17 | if arg is None: 18 | raise ValueError('Missing parameter') 19 | 20 | 21 | def check_for_type(type, *args): 22 | for arg in args: 23 | if not isinstance(arg, type): 24 | raise TypeError('Wrong type of parameter') 25 | 26 | 27 | def unicode_normalize(s): 28 | return unicodedata.normalize('NFKD', s) 29 | 30 | 31 | def convert_list_to_set(s): 32 | if isinstance(s, list): 33 | s = set(s) 34 | return s 35 | 36 | 37 | def candidate_pairs(dataset1: 'Dataset', 38 | dataset2: 'Dataset' = None, 39 | block: 'Block' = None, 40 | ground_truth: 'GroundTruth' = None): 41 | """ 42 | Generate candidate pairs to compare. 43 | 44 | Args: 45 | dataset1 (Dataset): Dataset 1. 46 | dataset2 (Dataset, optional): Dataset 2.
If it's not provided, it will be a de-duplication task. 47 | block (Block, optional): Block. 48 | ground_truth (GroundTruth, optional): Ground truth. 49 | """ 50 | if block and not ground_truth: 51 | if not dataset2: 52 | for _, id1, id2 in block.pairwise(dataset1.id): 53 | yield dataset1.get_record(id1), dataset1.get_record(id2) 54 | else: 55 | for _, id1, id2 in block.pairwise(dataset1.id, dataset2.id): 56 | yield dataset1.get_record(id1), dataset2.get_record(id2) 57 | elif ground_truth and not block: 58 | if not dataset2: 59 | for id1, id2, label in ground_truth: 60 | yield dataset1.get_record(id1), dataset1.get_record(id2) 61 | else: 62 | for id1, id2, label in ground_truth: 63 | yield dataset1.get_record(id1), dataset2.get_record(id2) 64 | elif ground_truth and block: 65 | if not dataset2: 66 | for _, id1, id2 in block.pairwise(dataset1.id): 67 | if ground_truth.is_member(id1, id2): 68 | yield dataset1.get_record(id1), dataset1.get_record(id2) 69 | else: 70 | for _, id1, id2 in block.pairwise(dataset1.id, dataset2.id): 71 | if ground_truth.is_member(id1, id2): 72 | yield dataset1.get_record(id1), dataset2.get_record(id2) 73 | else: 74 | if not dataset2: 75 | skip_offset = 0 76 | for r1 in dataset1: 77 | for offset, r2 in enumerate(dataset1): 78 | if offset < skip_offset: 79 | continue 80 | if r1.id == r2.id: 81 | continue 82 | yield r1, r2 83 | skip_offset += 1 84 | else: 85 | for r1 in dataset1: 86 | for r2 in dataset2: 87 | yield r1, r2 88 | 89 | 90 | get_record_pairs = candidate_pairs 91 | 92 | 93 | class ModuleImportWarning(UserWarning): 94 | pass 95 | 96 | 97 | def module_importer(module_names: str, dependencies, notes: str = None): # dependencies: str or list of str 98 | if isinstance(dependencies, str): 99 | dependencies = [dependencies] 100 | 101 | def module(): 102 | try: 103 | return __import__(module_names) 104 | except ImportError: 105 | warning_msg = '\n-----------------------------------\n' 106 | warning_msg += '\nImport Dependencies Error\n' 107 | 108 | if len(dependencies) > 0: 109 | warning_msg += '\nPlease install dependencies:\n' 110 | for d in dependencies: 111 | warning_msg += d + '\n' 112 | 113 | if notes: 114 | warning_msg += notes 115 | 116 | warning_msg += '\n-----------------------------------' 117 | warnings.warn(warning_msg, ModuleImportWarning) 118 | exit(500) 119 | 120 | return module 121 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | 5 | with open('rltk/__init__.py', 'r') as f: 6 | for line in f: 7 | if line.startswith('__version__'): 8 | exec(line) # fetch and create __version__ 9 | break 10 | 11 | with open('README.rst', 'r', encoding='utf-8') as f: 12 | long_description = f.read() 13 | 14 | with open('requirements.txt', 'r') as f: 15 | install_requires = list() 16 | dependency_links = list() 17 | for line in f: 18 | req = line.strip() 19 | if req: 20 | if req.startswith('git+') or req.startswith('svn+') or req.startswith('hg+'): 21 | dependency_links.append(req) 22 | else: 23 | install_requires.append(req) 24 | 25 | packages = find_packages() 26 | 27 | setup( 28 | name='rltk', 29 | version=__version__, 30 | packages=packages, 31 | url='https://github.com/usc-isi-i2/rltk', 32 | project_urls={ 33 | "Bug Tracker": "https://github.com/usc-isi-i2/rltk/issues", 34 | "Documentation": "https://rltk.readthedocs.io", 35 | "Source Code": "https://github.com/usc-isi-i2/rltk", 36 | }, 37 |
license='MIT', 38 | author='USC/ISI', 39 | author_email='yixiangy@isi.edu', 40 | description='Record Linkage ToolKit', 41 | long_description=long_description, 42 | long_description_content_type='text/x-rst', 43 | include_package_data=True, 44 | install_requires=install_requires, 45 | dependency_links=dependency_links, 46 | classifiers=( 47 | "Programming Language :: Python :: 3", 48 | "Natural Language :: English", 49 | "License :: OSI Approved :: MIT License", 50 | "Operating System :: OS Independent", 51 | "Topic :: Scientific/Engineering", 52 | "Topic :: Scientific/Engineering :: Information Analysis", 53 | "Topic :: Software Development :: Libraries", 54 | "Topic :: Software Development :: Libraries :: Python Modules", 55 | "Intended Audience :: Science/Research", 56 | "Intended Audience :: Developers", 57 | "Intended Audience :: Education", 58 | "Intended Audience :: Information Technology" 59 | ) 60 | ) 61 | --------------------------------------------------------------------------------
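A minimal end-to-end sketch tying the similarity utilities above together (imports use the exact module paths from this repository; the sample strings are made up, and the expected values in comments are the ones given in the docstrings above):

```
from rltk.similarity.soundex import soundex, soundex_similarity
from rltk.similarity.ngram import ngram_similarity
from rltk.similarity.tf_idf import TF_IDF

# phonetic code and phonetic equality
print(soundex('ashcraft'))                          # 'A261' (per the soundex docstring)
print(soundex_similarity('ashcraft', 'pineapple'))  # 0 ('A261' != 'P514')

# n-gram similarity
print(ngram_similarity('ABCD', 'ABTUIO'))           # 0.4166666666666667 (per the ngram docstring)

# corpus-level TF/IDF, following the TF_IDF class docstring
tfidf = TF_IDF()
tfidf.add_document('id1', ['a', 'b', 'a'])
tfidf.add_document('id2', ['b', 'c'])
tfidf.add_document('id3', ['b', 'd'])
tfidf.pre_compute()
print(tfidf.similarity('id1', 'id2'))
```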