├── .coveragerc
├── .github
│   └── workflows
│       ├── deploy.yml
│       └── tests.yml
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.rst
├── binder
│   ├── README.md
│   ├── postBuild
│   └── requirements.txt
├── docs
│   ├── Makefile
│   ├── _static
│   │   ├── logo-icon.png
│   │   ├── logo.png
│   │   └── style.css
│   ├── conf.py
│   ├── images
│   │   ├── overview-basic-workflow.png
│   │   ├── overview-blocking-tables.png
│   │   ├── overview-blocking-workflow.png
│   │   ├── overview-dataflow.png
│   │   ├── overview-evaluation-workflow.png
│   │   ├── overview-inputs.png
│   │   └── overview-tables.png
│   ├── index.rst
│   ├── installation.rst
│   ├── mod_blocking.rst
│   ├── mod_cli.rst
│   ├── mod_dataset.rst
│   ├── mod_evaluation.rst
│   ├── mod_io.rst
│   ├── mod_record.rst
│   ├── mod_remote.rst
│   ├── mod_similarity.rst
│   ├── mod_tokenizer.rst
│   ├── mod_utils.rst
│   ├── modules.rst
│   ├── overview.rst
│   ├── real_world_example.ipynb
│   ├── resources
│   │   ├── dblp.csv
│   │   ├── dblp_scholar_gt.csv
│   │   └── scholar.jl
│   ├── scaling_and_optimization.rst
│   └── step_by_step.ipynb
├── examples
│   ├── basic
│   │   ├── auto_record.py
│   │   ├── basic.py
│   │   ├── cached_record.py
│   │   ├── dedup.py
│   │   ├── ds1.csv
│   │   └── ds2.jl
│   ├── blocking
│   │   ├── block_io_operations.py
│   │   ├── canopy.py
│   │   ├── ds1.csv
│   │   ├── ds2.jl
│   │   ├── generate_blocks.py
│   │   └── inverted_index.py
│   └── evaluation
│       ├── .gitignore
│       ├── __init__.py
│       ├── construct_datasets.py
│       ├── data_1.csv
│       ├── data_2.csv
│       ├── generate_negative_gt.py
│       ├── ground_truth.csv
│       ├── gt_positive_only.csv
│       └── run_evaluation.py
├── requirements.txt
├── requirements_dev.txt
├── requirements_docs.txt
├── rltk
│   ├── __init__.py
│   ├── __main__.py
│   ├── blocking
│   │   ├── __init__.py
│   │   ├── _minhash_lsh.py
│   │   ├── block.py
│   │   ├── block_black_list.py
│   │   ├── block_generator.py
│   │   ├── blocking_helper.py
│   │   ├── canopy_block_generator.py
│   │   ├── hash_block_generator.py
│   │   ├── sorted_neighbourhood_block_generator.py
│   │   └── token_block_generator.py
│   ├── cli.py
│   ├── dataset.py
│   ├── evaluation
│   │   ├── __init__.py
│   │   ├── evaluation.py
│   │   ├── ground_truth.py
│   │   └── trial.py
│   ├── io
│   │   ├── __init__.py
│   │   ├── adapter
│   │   │   ├── __init__.py
│   │   │   ├── dbm_key_value_adapter.py
│   │   │   ├── hbase_key_value_adapter.py
│   │   │   ├── key_set_adapter.py
│   │   │   ├── key_value_adapter.py
│   │   │   ├── leveldb_key_set_adapter.py
│   │   │   ├── memory_key_set_adapter.py
│   │   │   ├── memory_key_value_adapter.py
│   │   │   ├── redis_key_set_adapter.py
│   │   │   └── redis_key_value_adapter.py
│   │   ├── io_utils.py
│   │   ├── reader
│   │   │   ├── __init__.py
│   │   │   ├── array_reader.py
│   │   │   ├── csv_reader.py
│   │   │   ├── dataframe_reader.py
│   │   │   ├── ground_truth_reader.py
│   │   │   ├── jsonlines_reader.py
│   │   │   └── reader.py
│   │   ├── serializer
│   │   │   ├── __init__.py
│   │   │   ├── pickle_serializer.py
│   │   │   └── serializer.py
│   │   └── writer
│   │       ├── __init__.py
│   │       ├── ground_truth_writer.py
│   │       └── writer.py
│   ├── record.py
│   ├── remote
│   │   ├── __init__.py
│   │   ├── remote.py
│   │   └── task.py
│   ├── similarity
│   │   ├── __init__.py
│   │   ├── cosine.py
│   │   ├── dice.py
│   │   ├── distance.py
│   │   ├── equal.py
│   │   ├── hamming.py
│   │   ├── hybrid.py
│   │   ├── jaccard.py
│   │   ├── jaro.py
│   │   ├── lcs.py
│   │   ├── levenshtein.py
│   │   ├── metaphone.py
│   │   ├── needleman.py
│   │   ├── ngram.py
│   │   ├── nysiis.py
│   │   ├── qgram.py
│   │   ├── soundex.py
│   │   └── tf_idf.py
│   ├── tests
│   │   ├── __init__.py
│   │   ├── test_blocking.py
│   │   ├── test_io_adapter.py
│   │   ├── test_io_reader.py
│   │   ├── test_similarity.py
│   │   └── test_trial.py
│   ├── tokenizer
│   │   ├── __init__.py
│   │   └── crf_tokenizer
│   │       ├── LICENSE
│   │       ├── README.md
│   │       ├── __init__.py
│   │       └── crf_tokenizer.py
│   └── utils.py
└── setup.py
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | omit = rltk/tests/*
3 |
4 |
--------------------------------------------------------------------------------
/.github/workflows/deploy.yml:
--------------------------------------------------------------------------------
1 | name: Deploy
2 | on:
3 | push:
4 | tags:
5 | - '*'
6 | jobs:
7 | deploy-to-pypi:
8 | name: Deploy to pypi
9 | runs-on: ubuntu-latest
10 | steps:
11 | - name: checkout code
12 | uses: actions/checkout@v2
13 | - name: Set up Python
14 | uses: actions/setup-python@v2
15 | with:
16 | python-version: '3.7'
17 | - name: Install dependencies
18 | run: |
19 | python -m pip install --upgrade pip
20 | pip install setuptools wheel twine
21 | - name: Build and publish
22 | env:
23 | TWINE_USERNAME: usc_isi_i2_admin
24 | TWINE_PASSWORD: ${{ secrets.PYPI }}
25 | run: |
26 | python setup.py sdist bdist_wheel
27 | twine upload dist/*
28 |
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | name: Tests
2 | on: push
3 | jobs:
4 | run-tests:
5 | name: Run pytest
6 | runs-on: ubuntu-latest
7 | strategy:
8 | matrix:
9 | python-version: [3.7, 3.8, 3.9]
10 | steps:
11 | - name: Checkout code
12 | uses: actions/checkout@v2
13 | - name: Set up Python ${{ matrix.python-version }}
14 | uses: actions/setup-python@v2
15 | with:
16 | python-version: ${{ matrix.python-version }}
17 | - name: Install dependencies
18 | run: |
19 | python -m pip install --upgrade pip
20 | pip install -r requirements.txt
21 | pip install -r requirements_dev.txt
22 | pip install -e .
23 | pip install coverage coveralls
24 | - name: Test with pytest
25 | run: |
26 | python -m pytest -v --color=yes --cov rltk rltk/tests/test_*
27 | - name: Coverage
28 | env:
29 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
30 | run: |
31 | coveralls --service=github
32 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # dotenv
79 | .env
80 |
81 | # virtualenv
82 | venv/
83 | ENV/
84 |
85 | # Spyder project settings
86 | .spyderproject
87 |
88 | # Rope project settings
89 | .ropeproject
90 |
91 | .idea/
92 | dev_test/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 University of Southern California
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include VERSION
3 | include requirements.txt
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: docs
2 |
3 | docs:
4 | @cd docs && make html
5 |
6 | run-docs:
7 | @cd docs/_build/html && python -m http.server 8080 --bind localhost
8 |
9 | release:
10 | @VERSION=$$(python -c "import rltk;print(rltk.__version__)") && git tag $$VERSION
11 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | RLTK: Record Linkage ToolKit
2 | ============================
3 |
4 | .. begin-intro
5 | .. image:: https://img.shields.io/badge/license-MIT-blue.svg
6 | :target: https://raw.githubusercontent.com/usc-isi-i2/rltk/master/LICENSE
7 | :alt: License
8 |
9 | .. image:: https://github.com/usc-isi-i2/rltk/workflows/Tests/badge.svg?branch=master
10 | :target: https://github.com/usc-isi-i2/rltk/actions
11 | :alt: Github actions
12 |
13 | .. image:: https://coveralls.io/repos/github/usc-isi-i2/rltk/badge.svg?branch=master
14 | :target: https://coveralls.io/github/usc-isi-i2/rltk?branch=master
15 | :alt: Coveralls
16 |
17 | .. image:: https://badge.fury.io/py/rltk.svg
18 | :target: https://badge.fury.io/py/rltk
19 | :alt: pypi
20 |
21 | .. image:: https://readthedocs.org/projects/rltk/badge/?version=latest
22 | :target: http://rltk.readthedocs.io/en/latest
23 | :alt: Documents
24 |
25 | The Record Linkage ToolKit (RLTK) is a general-purpose open-source record linkage platform that allows users to build powerful Python programs that link records referring to the same underlying entity. Record linkage is an extremely important problem that shows up in domains extending from social networks to bibliographic data and biomedicine. Current open platforms for record linkage have problems scaling even to moderately sized datasets, or are just not easy to use (even by experts). RLTK attempts to address all of these issues.
26 |
27 | RLTK supports a full, scalable record linkage pipeline, including multi-core algorithms for blocking, profiling data, computing a wide variety of features, and training and applying machine learning classifiers based on Python’s sklearn library. An end-to-end RLTK pipeline can be jump-started with only a few lines of code. However, RLTK is also designed to be extensible and customizable, allowing users arbitrary degrees of control over many of the individual components. You can add new features to RLTK (e.g. a custom string similarity) very easily.
28 |
29 | RLTK is being built by the `Center on Knowledge Graphs <http://usc-isi-i2.github.io/>`_ at `USC/ISI <https://www.isi.edu/>`_, with funding from the DARPA LORELEI and MEMEX programs and the IARPA CAUSE program.
30 | RLTK is under active maintenance and we expect to keep adding new features and state-of-the-art record linkage algorithms in the foreseeable future, in addition to continuously supporting our adopters in integrating the platform into their applications.
31 |
32 | Getting Started
33 | ---------------
34 |
35 | Installation (make sure prerequisites are installed)::
36 |
37 | pip install -U rltk
38 |
39 | Example::
40 |
41 | >>> import rltk
42 | >>> rltk.levenshtein_distance('abc', 'abd')
43 | 1
44 |
45 | Try RLTK Online
46 | ---------------
47 |
48 | * `Stable version <https://mybinder.org/v2/gh/usc-isi-i2/rltk/master>`_
49 | * `Development version <https://mybinder.org/v2/gh/usc-isi-i2/rltk/develop>`_
50 |
51 | .. end-intro
52 |
53 | Datasets & Experiments
54 | ----------------------
55 | * `rltk-experimentation <https://github.com/usc-isi-i2/rltk-experimentation>`_
56 |
57 | Documentation
58 | -------------
59 |
60 | * `Tutorials <http://rltk.readthedocs.io/en/latest/>`_
61 | * `API Reference <http://rltk.readthedocs.io/en/latest/modules.html>`_
62 |
--------------------------------------------------------------------------------
/binder/README.md:
--------------------------------------------------------------------------------
1 | # RLTK Jupyter Binder
2 |
3 | This folder is used by [Binder](https://mybinder.org/).
4 |
--------------------------------------------------------------------------------
/binder/postBuild:
--------------------------------------------------------------------------------
1 | git clone --depth 1 -b master https://github.com/usc-isi-i2/rltk-experimentation
2 |
--------------------------------------------------------------------------------
/binder/requirements.txt:
--------------------------------------------------------------------------------
1 | scikit-learn
2 | -r ../requirements.txt
3 | -e .
4 |
5 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SPHINXPROJ = rltk
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/docs/_static/logo-icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/usc-isi-i2/rltk/aee10ed5dd561583e60db3373ed82fe1208da1e9/docs/_static/logo-icon.png
--------------------------------------------------------------------------------
/docs/_static/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/usc-isi-i2/rltk/aee10ed5dd561583e60db3373ed82fe1208da1e9/docs/_static/logo.png
--------------------------------------------------------------------------------
/docs/_static/style.css:
--------------------------------------------------------------------------------
1 | @import url("https://fonts.googleapis.com/css?family=Ubuntu+Mono");
2 | @import url("https://fonts.googleapis.com/css?family=Open+Sans");
3 |
4 | pre, code {
5 | font-family: "Ubuntu Mono", "Consolas", "Menlo", "DejaVu Sans Mono", "Bitstream Vera Sans Mono", monospace;
6 | font-size: 15px;
7 | }
8 |
9 | h1, h2, h3, h4, h5, h6, p.admonition-title, div.sphinxsidebar input, body {
10 | font-family: "Open Sans", "Helvetica", "Arial", sans-serif;
11 | }
12 |
13 | div.sphinxsidebar ul li.toctree-l1 > a {
14 | font-size: 100%;
15 | }
16 |
17 | div.sphinxsidebar ul li.toctree-l2 > a {
18 | font-size: 100%;
19 | }
20 |
21 | div.sphinxsidebar ul li.toctree-l3 > a {
22 | font-size: 100%;
23 | }
24 |
25 | div.body {
26 | max-width: 100%; /* overwrite basic.css */
27 | }
28 |
29 | table.dataframe {
30 | border-collapse: collapse;
31 | /*width: 100%;*/
32 | }
33 |
34 | table.dataframe th, table.dataframe td {
35 | text-align: left;
36 | padding: 8px;
37 | }
38 |
39 | table.dataframe tr:nth-child(even) {
40 | background-color: #f2f2f2;
41 | }
42 |
43 | blockquote {
44 | border-left: 5px solid #eeeeee;
45 | padding: 10px 20px;
46 | }
47 |
48 | div.sphinxsidebarwrapper p.logo {
49 | margin-bottom: 30px;
50 | }
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # rltk documentation build configuration file, created by
4 | # sphinx-quickstart on Thu Feb 23 13:46:31 2017.
5 | #
6 | # This file is execfile()d with the current directory set to its
7 | # containing dir.
8 | #
9 | # Note that not all possible configuration values are present in this
10 | # autogenerated file.
11 | #
12 | # All configuration values have a default; values that are commented out
13 | # serve to show the default.
14 |
15 | # If extensions (or modules to document with autodoc) are in another directory,
16 | # add these directories to sys.path here. If the directory is relative to the
17 | # documentation root, use os.path.abspath to make it absolute, like shown here.
18 | #
19 | import os
20 | import sys
21 | import datetime
22 | sys.path.insert(0, os.path.abspath('../rltk'))
23 | sys.path.insert(0, os.path.abspath('../'))
24 |
25 |
26 | # -- General configuration ------------------------------------------------
27 |
28 | # If your documentation needs a minimal Sphinx version, state it here.
29 | #
30 | # needs_sphinx = '1.0'
31 |
32 | # Add any Sphinx extension module names here, as strings. They can be
33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
34 | # ones.
35 | extensions = ['sphinx.ext.autodoc', 'sphinxcontrib.napoleon', 'nbsphinx',
36 | 'IPython.sphinxext.ipython_console_highlighting'] # 'sphinx.ext.viewcode'
37 |
38 | # Add any paths that contain templates here, relative to this directory.
39 | templates_path = ['_templates']
40 |
41 | # The suffix(es) of source filenames.
42 | # You can specify multiple suffix as a list of string:
43 | #
44 | # source_suffix = ['.rst', '.md']
45 | source_suffix = '.rst'
46 |
47 | # The master toctree document.
48 | master_doc = 'index'
49 |
50 | # General information about the project.
51 | project = 'RLTK'
52 | copyright = '{}, USC/ISI'.format(datetime.datetime.now().year)
53 | author = 'USC/ISI'
54 |
55 | # The version info for the project you're documenting, acts as replacement for
56 | # |version| and |release|, also used in various other places throughout the
57 | # built documents.
58 | #
59 | with open('../rltk/__init__.py', 'r') as f:
60 | for line in f:
61 | if line.startswith('__version__'):
62 | exec(line) # fetch and create __version__
63 | break
64 | rltk_version = __version__
65 | # The short X.Y version.
66 | version = '.'.join(rltk_version.split('.')[:2])
67 | # The full version, including alpha/beta/rc tags.
68 | release = rltk_version
69 |
70 | # The language for content autogenerated by Sphinx. Refer to documentation
71 | # for a list of supported languages.
72 | #
73 | # This is also used if you do content translation via gettext catalogs.
74 | # Usually you set "language" from the command line for these cases.
75 | language = None
76 |
77 | # List of patterns, relative to source directory, that match files and
78 | # directories to ignore when looking for source files.
79 | # This patterns also effect to html_static_path and html_extra_path
80 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', '**.ipynb_checkpoints']
81 |
82 | # The name of the Pygments (syntax highlighting) style to use.
83 | pygments_style = 'sphinx'
84 |
85 | # If true, `todo` and `todoList` produce output, else they produce nothing.
86 | todo_include_todos = False
87 |
88 |
89 | # -- Options for HTML output ----------------------------------------------
90 |
91 | # The theme to use for HTML and HTML Help pages. See the documentation for
92 | # a list of builtin themes.
93 | #
94 | html_theme = 'alabaster' # default, alabaster, pyramid, bizstyle
95 |
96 | # Theme options are theme-specific and customize the look and feel of a theme
97 | # further. For a list of options available for each theme, see the
98 | # documentation.
99 | #
100 | html_theme_options = {
101 | 'logo': 'logo-icon.png',
102 | 'page_width': '1380px',
103 | 'sidebar_width': '220px',
104 | 'github_user': 'usc-isi-i2',
105 | 'github_repo': 'rltk',
106 | 'github_banner': 'true',
107 | 'github_type': 'star',
108 | 'extra_nav_links': {
109 | 'RLTK @ GitHub': 'https://github.com/usc-isi-i2/rltk',
110 | 'RLTK @ PyPI': 'https://pypi.org/project/rltk',
111 | 'Issue Tracker': 'https://github.com/usc-isi-i2/rltk/issues',
112 | 'USC/ISI CKG': 'http://usc-isi-i2.github.io/'
113 | },
114 | 'show_powered_by': False
115 | }
116 |
117 | html_show_sourcelink = False
118 |
119 | html_sidebars = {
120 | '**': [
121 | 'about.html',
122 | 'localtoc.html',
123 | 'navigation.html',
124 | # 'relations.html',
125 | 'searchbox.html',
126 | # 'donate.html',
127 | ]
128 | }
129 |
130 | # Add any paths that contain custom static files (such as style sheets) here,
131 | # relative to this directory. They are copied after the builtin static files,
132 | # so a file named "default.css" will overwrite the builtin "default.css".
133 | html_static_path = ['_static']
134 |
135 |
136 | # -- Options for HTMLHelp output ------------------------------------------
137 |
138 | # Output file base name for HTML help builder.
139 | htmlhelp_basename = 'rltkdoc'
140 |
141 |
142 | # -- Options for LaTeX output ---------------------------------------------
143 |
144 | latex_elements = {
145 | # The paper size ('letterpaper' or 'a4paper').
146 | #
147 | # 'papersize': 'letterpaper',
148 |
149 | # The font size ('10pt', '11pt' or '12pt').
150 | #
151 | # 'pointsize': '10pt',
152 |
153 | # Additional stuff for the LaTeX preamble.
154 | #
155 | # 'preamble': '',
156 |
157 | # Latex figure (float) alignment
158 | #
159 | # 'figure_align': 'htbp',
160 | }
161 |
162 | # Grouping the document tree into LaTeX files. List of tuples
163 | # (source start file, target name, title,
164 | # author, documentclass [howto, manual, or own class]).
165 | latex_documents = [
166 | (master_doc, 'rltk.tex', 'RLTK Documentation',
167 | u'USC/ISI', 'manual'),
168 | ]
169 |
170 |
171 | # -- Options for manual page output ---------------------------------------
172 |
173 | # One entry per manual page. List of tuples
174 | # (source start file, name, description, authors, manual section).
175 | man_pages = [
176 | (master_doc, 'rltk', 'RLTK Documentation',
177 | [author], 1)
178 | ]
179 |
180 |
181 | # -- Options for Texinfo output -------------------------------------------
182 |
183 | # Grouping the document tree into Texinfo files. List of tuples
184 | # (source start file, target name, title, author,
185 | # dir menu entry, description, category)
186 | texinfo_documents = [
187 | (master_doc, 'rltk', 'RLTK Documentation',
188 | author, 'rltk', 'Record Linkage ToolKit',
189 | 'Miscellaneous'),
190 | ]
191 |
192 |
193 | def setup(app):
194 | app.add_stylesheet('style.css')
195 |
--------------------------------------------------------------------------------
/docs/images/overview-basic-workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/usc-isi-i2/rltk/aee10ed5dd561583e60db3373ed82fe1208da1e9/docs/images/overview-basic-workflow.png
--------------------------------------------------------------------------------
/docs/images/overview-blocking-tables.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/usc-isi-i2/rltk/aee10ed5dd561583e60db3373ed82fe1208da1e9/docs/images/overview-blocking-tables.png
--------------------------------------------------------------------------------
/docs/images/overview-blocking-workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/usc-isi-i2/rltk/aee10ed5dd561583e60db3373ed82fe1208da1e9/docs/images/overview-blocking-workflow.png
--------------------------------------------------------------------------------
/docs/images/overview-dataflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/usc-isi-i2/rltk/aee10ed5dd561583e60db3373ed82fe1208da1e9/docs/images/overview-dataflow.png
--------------------------------------------------------------------------------
/docs/images/overview-evaluation-workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/usc-isi-i2/rltk/aee10ed5dd561583e60db3373ed82fe1208da1e9/docs/images/overview-evaluation-workflow.png
--------------------------------------------------------------------------------
/docs/images/overview-inputs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/usc-isi-i2/rltk/aee10ed5dd561583e60db3373ed82fe1208da1e9/docs/images/overview-inputs.png
--------------------------------------------------------------------------------
/docs/images/overview-tables.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/usc-isi-i2/rltk/aee10ed5dd561583e60db3373ed82fe1208da1e9/docs/images/overview-tables.png
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. rltk documentation master file, created by
2 | sphinx-quickstart on Thu Feb 23 13:46:31 2017.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | .. include:: ./../README.rst
7 | :start-after: begin-intro
8 | :end-before: end-intro
9 |
10 |
11 | Tutorial
12 | -------------
13 |
14 | .. toctree::
15 | :maxdepth: 2
16 |
17 | installation.rst
18 | overview.rst
19 | step_by_step.ipynb
20 | real_world_example.ipynb
21 | scaling_and_optimization.rst
22 |
23 |
24 | API Reference
25 | -------------
26 |
27 | .. toctree::
28 | :maxdepth: 3
29 |
30 | modules.rst
31 |
--------------------------------------------------------------------------------
/docs/installation.rst:
--------------------------------------------------------------------------------
1 | Installation
2 | ============
3 |
4 | .. note::
5 |
6 | RLTK only supports Python 3 and it's tested under Python 3.7+.
7 |
8 |
9 | pip
10 | ----
11 |
12 | Install using pip::
13 |
14 | pip install rltk
15 |
16 | If you want to update RLTK::
17 |
18 | pip install -U rltk
19 |
20 | Generally, it's recommended to install packages in a virtual environment::
21 |
22 | virtualenv rltk_env
23 | source rltk_env/bin/activate
24 | pip install rltk
25 |
26 |
27 | Install from source
28 | -------------------
29 |
30 | The other way to install RLTK is to clone the GitHub repository and build it from source::
31 |
32 | git clone https://github.com/usc-isi-i2/rltk.git
33 | cd rltk
34 |
35 | virtualenv rltk_env
36 | source rltk_env/bin/activate
37 | pip install -e .
38 |
39 | Run tests
40 | ---------
41 |
42 | RLTK uses `pytest <https://docs.pytest.org/>`_ for unit tests. To run them, simply run the following command from the root of the rltk package::
43 |
44 | pytest
45 |
46 | If you need more detailed output, run::
47 |
48 | pytest -v --color=yes
49 |
50 | Build documentation
51 | -------------------
52 |
53 | Additional dependencies for building documentation should be installed first::
54 |
55 | pip install -r requirements_docs.txt
56 |
57 | Documentation is powered by `Sphinx <https://www.sphinx-doc.org/>`_. To generate it locally, run::
58 |
59 | cd docs
60 | make html # the generated doc is located at _build/html/index.html
61 |
--------------------------------------------------------------------------------
/docs/mod_blocking.rst:
--------------------------------------------------------------------------------
1 | Blocking
2 | ========
3 |
4 | Block
5 | -----
6 |
7 | .. automodule:: rltk.blocking.block
8 | :members:
9 | :special-members:
10 | :exclude-members: __dict__, __weakref__, __init__
11 |
12 |
13 | Block Black List
14 | ----------------
15 |
16 | .. automodule:: rltk.blocking.block_black_list
17 | :members:
18 | :special-members:
19 | :exclude-members: __dict__, __weakref__, __init__
20 |
21 | Block Generator
22 | ---------------
23 |
24 | .. automodule:: rltk.blocking.block_generator
25 | :members:
26 | :special-members:
27 | :exclude-members: __dict__, __weakref__, __init__
28 |
29 | .. automodule:: rltk.blocking.hash_block_generator
30 | :members:
31 | :special-members:
32 | :exclude-members: __dict__, __weakref__, __init__
33 |
34 | .. automodule:: rltk.blocking.token_block_generator
35 | :members:
36 | :special-members:
37 | :exclude-members: __dict__, __weakref__, __init__
38 |
39 | .. automodule:: rltk.blocking.sorted_neighbourhood_block_generator
40 | :members:
41 | :special-members:
42 | :exclude-members: __dict__, __weakref__, __init__
43 |
44 | .. automodule:: rltk.blocking.canopy_block_generator
45 | :members:
46 | :special-members:
47 | :exclude-members: __dict__, __weakref__, __init__
48 |
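A minimal usage sketch: hash-block two datasets on a shared property, then pair records only within blocks (mirroring ``examples/blocking/generate_blocks.py``; ``ds1`` and ``ds2`` are assumed to be existing datasets whose records expose ``first_name``):

.. code-block:: python

    # records ending up in the same block share a first_name value
    bg = rltk.HashBlockGenerator()
    block = bg.generate(bg.block(ds1, property_='first_name'),
                        bg.block(ds2, property_='first_name'))
    for r1, r2 in rltk.get_record_pairs(ds1, ds2, block=block):
        print(r1.id, r2.id)
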
49 | Blocking Helper
50 | ---------------
51 |
52 | .. automodule:: rltk.blocking.blocking_helper
53 | :members:
54 | :special-members:
55 | :exclude-members: __dict__, __weakref__, __init__
56 |
--------------------------------------------------------------------------------
/docs/mod_cli.rst:
--------------------------------------------------------------------------------
1 | Command line interface (CLI)
2 | ----------------------------
3 |
4 | .. automodule:: rltk.cli
5 | :members:
6 |
--------------------------------------------------------------------------------
/docs/mod_dataset.rst:
--------------------------------------------------------------------------------
1 | Dataset
2 | =======
3 |
4 | .. automodule:: rltk.dataset
5 | :members:
6 | :special-members:
7 | :exclude-members: __dict__, __weakref__, __init__
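
A minimal construction sketch (as in ``examples/basic/basic.py``; ``Record1`` is assumed to be a user-defined record class):

.. code-block:: python

    import rltk

    # a Dataset combines a reader (where raw objects come from),
    # a record class (how raw objects are parsed), and an optional
    # adapter (where parsed records are stored)
    ds = rltk.Dataset(reader=rltk.CSVReader('ds1.csv'),
                      record_class=Record1,
                      adapter=rltk.MemoryKeyValueAdapter())
    for r in ds:
        print(r.id)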
--------------------------------------------------------------------------------
/docs/mod_evaluation.rst:
--------------------------------------------------------------------------------
1 | Evaluation
2 | ==========
3 |
4 | GroundTruth
5 | -----------
6 |
7 | .. automodule:: rltk.evaluation.ground_truth
8 | :members:
9 | :special-members:
10 | :exclude-members: __dict__, __weakref__, __init__
11 |
12 | Trial
13 | -----
14 |
15 | .. automodule:: rltk.evaluation.trial
16 | :members:
17 | :special-members:
18 | :exclude-members: __dict__, __weakref__, __init__
19 |
20 | Evaluation
21 | ----------
22 |
23 | .. automodule:: rltk.evaluation.evaluation
24 | :members:
25 | :special-members:
26 | :exclude-members: __dict__, __weakref__, __init__
27 |
--------------------------------------------------------------------------------
/docs/mod_io.rst:
--------------------------------------------------------------------------------
1 | IO (Input & Output)
2 | ===================
3 |
4 | Reader
5 | ------
6 |
7 | Generic Reader
8 | ^^^^^^^^^^^^^^
9 |
10 | .. automodule:: rltk.io.reader.reader
11 | :members:
12 | :special-members:
13 | :exclude-members: __dict__, __weakref__, __init__
14 |
15 | .. automodule:: rltk.io.reader.array_reader
16 | :members:
17 | :special-members:
18 | :exclude-members: __dict__, __weakref__, __init__
19 |
20 | .. automodule:: rltk.io.reader.jsonlines_reader
21 | :members:
22 | :special-members:
23 | :exclude-members: __dict__, __weakref__, __init__
24 |
25 | .. automodule:: rltk.io.reader.csv_reader
26 | :members:
27 | :special-members:
28 | :exclude-members: __dict__, __weakref__, __init__
29 |
30 | .. automodule:: rltk.io.reader.dataframe_reader
31 | :members:
32 | :special-members:
33 | :exclude-members: __dict__, __weakref__, __init__
34 |
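A reader adapts a raw input (array, CSV file, JSON Lines file, DataFrame) into the stream of objects a Dataset consumes. A short sketch using readers from the bundled examples (file names and record classes are illustrative):

.. code-block:: python

    ds1 = rltk.Dataset(reader=rltk.CSVReader('ds1.csv'), record_class=Record1)
    ds2 = rltk.Dataset(reader=rltk.JsonLinesReader('ds2.jl'), record_class=Record2)
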
35 | GroundTruth Reader
36 | ^^^^^^^^^^^^^^^^^^
37 |
38 | .. automodule:: rltk.io.reader.ground_truth_reader
39 | :members:
40 | :special-members:
41 | :exclude-members: __dict__, __weakref__, __init__
42 |
43 | Writer
44 | ------
45 |
46 | Generic Writer
47 | ^^^^^^^^^^^^^^
48 |
49 | .. automodule:: rltk.io.writer.writer
50 | :members:
51 | :special-members:
52 | :exclude-members: __dict__, __weakref__, __init__
53 |
54 | GroundTruth Writer
55 | ^^^^^^^^^^^^^^^^^^
56 |
57 | .. automodule:: rltk.io.writer.ground_truth_writer
58 | :members:
59 | :special-members:
60 | :exclude-members: __dict__, __weakref__, __init__
61 |
62 | Adapter
63 | -------
64 |
65 | Key Value Adapter
66 | ^^^^^^^^^^^^^^^^^
67 |
68 | .. automodule:: rltk.io.adapter.key_value_adapter
69 | :members:
70 | :special-members:
71 | :exclude-members: __dict__, __weakref__, __init__
72 |
73 | .. automodule:: rltk.io.adapter.memory_key_value_adapter
74 | :members:
75 | :special-members:
76 | :exclude-members: __dict__, __weakref__, __init__
77 |
78 | .. automodule:: rltk.io.adapter.redis_key_value_adapter
79 | :members:
80 | :special-members:
81 | :exclude-members: __dict__, __weakref__, __init__
82 |
83 | .. automodule:: rltk.io.adapter.hbase_key_value_adapter
84 | :members:
85 | :special-members:
86 | :exclude-members: __dict__, __weakref__, __init__
87 |
88 | .. automodule:: rltk.io.adapter.dbm_key_value_adapter
89 | :members:
90 | :special-members:
91 | :exclude-members: __dict__, __weakref__, __init__
92 |
93 | Key Set Adapter
94 | ^^^^^^^^^^^^^^^
95 |
96 | .. automodule:: rltk.io.adapter.key_set_adapter
97 | :members:
98 | :special-members:
99 | :exclude-members: __dict__, __weakref__, __init__
100 |
101 | .. automodule:: rltk.io.adapter.memory_key_set_adapter
102 | :members:
103 | :special-members:
104 | :exclude-members: __dict__, __weakref__, __init__
105 |
106 | .. automodule:: rltk.io.adapter.leveldb_key_set_adapter
107 | :members:
108 | :special-members:
109 | :exclude-members: __dict__, __weakref__, __init__
110 |
111 | .. automodule:: rltk.io.adapter.redis_key_set_adapter
112 | :members:
113 | :special-members:
114 | :exclude-members: __dict__, __weakref__, __init__
115 |
116 | Serializer
117 | ----------
118 |
119 | .. automodule:: rltk.io.serializer.serializer
120 | :members:
121 | :special-members:
122 | :exclude-members: __dict__, __weakref__, __init__
123 |
124 | .. automodule:: rltk.io.serializer.pickle_serializer
125 | :members:
126 | :special-members:
127 | :exclude-members: __dict__, __weakref__, __init__
128 |
129 | Utilities
130 | ---------
131 |
132 | .. automodule:: rltk.io.io_utils
133 | :members:
134 | :special-members:
135 | :exclude-members: __dict__, __weakref__, __init__
136 |
--------------------------------------------------------------------------------
/docs/mod_record.rst:
--------------------------------------------------------------------------------
1 | Record
2 | ======
3 |
4 | .. automodule:: rltk.record
5 | :members:
6 | :special-members:
7 | :exclude-members: __dict__, __weakref__, __init__
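
A short sketch of the record decorators used throughout the bundled examples: ``cached_property`` computes a field once and caches it, and ``remove_raw_object`` drops ``raw_object`` after the cached properties have been computed (adapted from ``examples/basic/cached_record.py``):

.. code-block:: python

    import rltk

    @rltk.remove_raw_object
    class MyRecord(rltk.Record):

        @rltk.cached_property
        def id(self):
            # computed once, then served from the cache
            return self.raw_object['doc_id']

        @rltk.cached_property
        def value(self):
            return self.raw_object['doc_value']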
--------------------------------------------------------------------------------
/docs/mod_remote.rst:
--------------------------------------------------------------------------------
1 | Remote
2 | ======
3 |
4 | RLTK's remote module is based on `Dask distributed <https://distributed.dask.org/>`_. It has a `scheduler`, which coordinates the actions of several `worker` processes spread across multiple machines and the concurrent requests of several clients.
5 |
6 | To start the scheduler, run:
7 | 
8 | .. code-block:: bash
9 | 
10 | python -m rltk remote.scheduler --port <port>
11 | 
12 | Then, on each worker machine, run:
13 | 
14 | .. code-block:: bash
15 | 
16 | python -m rltk remote.worker <scheduler-ip>:<port> --nprocs <num-of-processes>
17 |
18 | Authentication is supported through Privacy Enhanced Mail (PEM) files. You can either get them from a CA (Certificate Authority) or generate self-signed PEM files locally. Here's an example of generating PEM files using `OpenSSL <https://www.openssl.org/>`_:
19 |
20 | .. code-block:: bash
21 |
22 | openssl req -newkey rsa:2048 -new -nodes -x509 -days 3650 -keyout key.pem -out cert.pem
23 |
24 | Then provide these PEM files when starting the scheduler and workers. If you don't have a CA certificate, set `tls-ca-file` to the same file as `tls-cert`.
25 |
26 | .. code-block:: bash
27 |
28 | # scheduler
29 | python -m rltk remote.scheduler --port <port> --tls-ca-file cert.pem --tls-cert cert.pem --tls-key key.pem
30 |
31 | # worker, specify protocol TLS in scheduler's address
32 | python -m rltk remote.worker tls://<scheduler-ip>:<port> --tls-ca-file cert.pem --tls-cert cert.pem --tls-key key.pem
33 |
34 | Dask provides a web UI for monitoring scheduler and worker status; detailed usage can be found in the `Dask distributed documentation <https://distributed.dask.org/>`_.
35 |
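On the client side, the classes documented below are used to submit work to the cluster. A connection sketch (mirroring the distributed-computing tutorial, with an illustrative scheduler address and user-defined handlers):

.. code-block:: python

    remote = rltk.remote.Remote('127.0.0.1:8786')
    task = rltk.remote.Task(remote, input_handler=input_handler,
                            output_handler=output_handler)
    task.start()
    # ... submit work with task.compute(...) ...
    task.task_done()
    task.join()
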
36 | Remote
37 | ------
38 |
39 | .. automodule:: rltk.remote.remote
40 | :members:
41 | :special-members:
42 | :exclude-members: __dict__, __weakref__, __init__
43 |
44 | Task
45 | ----
46 |
47 | .. automodule:: rltk.remote.task
48 | :members:
49 | :special-members:
50 | :exclude-members: __dict__, __weakref__, __init__
--------------------------------------------------------------------------------
/docs/mod_similarity.rst:
--------------------------------------------------------------------------------
1 | Similarity
2 | ==========
3 |
4 | Normal metrics
5 | --------------
6 |
7 | .. automodule:: rltk.similarity.equal
8 | :members:
9 |
10 | .. automodule:: rltk.similarity.hamming
11 | :members:
12 |
13 | .. automodule:: rltk.similarity.dice
14 | :members:
15 |
16 | .. automodule:: rltk.similarity.levenshtein
17 | :members:
18 |
19 | .. automodule:: rltk.similarity.needleman
20 | :members:
21 |
22 | .. automodule:: rltk.similarity.jaro
23 | :members:
24 |
25 | .. automodule:: rltk.similarity.jaccard
26 | :members:
27 |
28 | .. automodule:: rltk.similarity.cosine
29 | :members:
30 |
31 | .. automodule:: rltk.similarity.tf_idf
32 | :members:
33 |
34 |
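These metrics are plain functions exposed at the package level; for instance (``levenshtein_similarity`` is the normalized counterpart of the edit distance):

.. code-block:: python

    import rltk

    rltk.levenshtein_distance('abc', 'abd')    # edit distance: 1
    rltk.levenshtein_similarity('abc', 'abd')  # normalized similarity in [0, 1]
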
35 | Hybrid metrics
36 | --------------
37 |
38 | .. automodule:: rltk.similarity.hybrid
39 | :members:
40 |
41 |
42 | Phonetic metrics
43 | ----------------
44 |
45 | .. automodule:: rltk.similarity.soundex
46 | :members:
47 |
48 | .. automodule:: rltk.similarity.metaphone
49 | :members:
50 |
51 | .. automodule:: rltk.similarity.nysiis
52 | :members:
53 |
--------------------------------------------------------------------------------
/docs/mod_tokenizer.rst:
--------------------------------------------------------------------------------
1 | Tokenizer
2 | =========
3 |
4 | .. automodule:: rltk.tokenizer
5 | :members:
6 | :special-members:
7 | :exclude-members: __dict__, __weakref__, __init__
--------------------------------------------------------------------------------
/docs/mod_utils.rst:
--------------------------------------------------------------------------------
1 | Utilities
2 | ---------
3 |
4 | .. automodule:: rltk.utils
5 | :members:
6 |
--------------------------------------------------------------------------------
/docs/modules.rst:
--------------------------------------------------------------------------------
1 | API Reference
2 | =================
3 |
4 | .. toctree::
5 | :maxdepth: 4
6 |
7 | mod_dataset.rst
8 | mod_record.rst
9 | mod_similarity.rst
10 | mod_evaluation.rst
11 | mod_blocking.rst
12 | mod_io.rst
13 | mod_utils.rst
14 | mod_cli.rst
15 | mod_remote.rst
16 | mod_tokenizer.rst
17 |
--------------------------------------------------------------------------------
/docs/resources/dblp_scholar_gt.csv:
--------------------------------------------------------------------------------
1 | idDBLP,idScholar
2 | conf/vldb/RothS97,gIFZWp_iCmUJ
3 | conf/sigmod/HellersteinHW97,7dcgPQxHuCMJ
4 | journals/tods/StolboushkinT98,ZnWLup8HMkUJ
5 | journals/sigmod/BohmR94,wLNJcNvsulkJ
6 | journals/vldb/Sarawagi01,Xx6kw0tCeQIJ
7 | conf/sigmod/FernandezFKLS97,ek26aiEheesJ
8 | journals/sigmod/Libkin99,zSO1Y5W7WkwJ
9 | journals/sigmod/KappelR98,tbZ0J3HLI18J
10 | conf/sigmod/TatarinovVBSSZ02,0HlMHEPJRH4J
11 | conf/vldb/SistlaYH94,_jl3bN2QlE4J
12 | conf/vldb/PetrovicBJ03,nIuz3dc8yHMJ
13 | journals/sigmod/FlorescuLM98,zkbTv93Zp1UJ
14 | conf/sigmod/AdaliCPS96,rmtEGXAXHKIJ
15 | journals/tods/FranklinCL97,-iaSLKFHwUkJ
16 | conf/sigmod/MamoulisP99,RusJdYPDgQ4J
17 | conf/sigmod/CherniackZ96,c9Humx2-EMgJ
18 | conf/vldb/DarFJST96,xhle0bk7qsMJ
19 | journals/sigmod/Yang94,sHJ914nPZtUJ
20 | conf/sigmod/TatarinovIHW01,jfkafZcMjgIJ
21 | conf/sigmod/BreunigKKS01,LxyVmHubIfUJ
22 | conf/sigmod/MelnikRB03,wfjeWtEY2NcJ
23 | conf/sigmod/HernandezMHYHT01,wMa4fMryrt0J
24 | journals/vldb/BarbaraI95,bTYTn8VG5hIJ
25 | conf/vldb/MedianoCD94,qwjRkZuiMHsJ
26 | conf/vldb/DattaDTVRF01,rDObsYKVroMJ
27 | journals/sigmod/AtkinsonDJPS96,yfjKkIhvfXcJ
28 | conf/vldb/SrivastavaDJL96,WvvY_Ao19mAJ
29 | conf/vldb/DattaDTVRF01,8_5A88ESaQ0J
30 | journals/vldb/VerykiosME03,khdP4spNnPoJ
31 | conf/sigmod/SimmenSM96,XVP8s4K0Bg4J
32 | conf/sigmod/ZhangDWEMPMDR03,1hkVjoUg8hUJ
33 | journals/sigmod/AshishK97,3zbpGI3YqXUJ
34 | conf/vldb/RohmBSS02,YMcmy4FOXi8J
35 | journals/tods/GoldmanL94,VxMarpzwtzQJ
36 | conf/vldb/DeutschPT99,0aJOXauNqYIJ
37 | conf/sigmod/AdelbergGW97,XytbGy--8LMJ
38 | conf/vldb/RaghavanG01,9Wo54Wyh_X8J
39 | conf/vldb/AmbiteKKMST01,F4DtzxvVZnoJ
40 | journals/tods/LitwinN96,l0W27c1C3NwJ
41 | journals/sigmod/DogacDKOONEHAKKM95,oAO74aolStoJ
42 | journals/sigmod/DiazJPa94,sG7PCEiN2xAJ
43 | conf/sigmod/MogiK96,G-ggEZEKjT8J
44 | conf/sigmod/HanKS97,XFCkL9QhTjIJ
45 | conf/vldb/Raghavan96,zuVOWDbv0lsJ
46 | conf/sigmod/LitwinS00,Ad5NAPgWIIUJ
47 | conf/sigmod/ZhouWGGZWXYF03,AwFxLUGiceUJ
48 | conf/vldb/Brin95,jXvsW6VxbMYJ
49 | conf/vldb/ShanmugasundaramGTZDN99,kaq5eLrzrQsJ
50 | journals/tods/FernandezKSMT02,ckrgSn0vBOMJ
51 | conf/sigmod/Kramer97,noTo81QxmHQJ
52 | conf/sigmod/BerchtoldK98,VuMPPr6k95AJ
53 | journals/vldb/LiR99,Ko9e8CH2Si4J
54 | journals/sigmod/GrayG97,ruMkEFagTIUJ
55 | conf/sigmod/LeeKOTM00,C8RLbWKCgicJ
56 | conf/sigmod/GibbonsM98,x4HkJDEYFmYJ
57 | conf/sigmod/AshleyFHLNP95,9uxj2XzGt9UJ
58 | conf/sigmod/GardarinMP95,IOqPoq2MSvQJ
59 | conf/sigmod/Brown99,itP4yy9sLUUJ
60 | conf/sigmod/AcharyaGPR99a,IkNOhDqEY18J
61 | conf/sigmod/HuangSW94,xF8s5N7oUIMJ
62 | journals/tods/GuoSW96,V8Ls_TYs6mgJ
63 | conf/vldb/CeriM03,G2QuI5QYYMoJ
64 | conf/vldb/ChristodoulakisTZ97,D0z0BDnbnFcJ
65 | conf/sigmod/HammerGNYBV97,2rysKgS6lugJ
66 | conf/sigmod/BhattacharyaMBNHS02,cg01BqanXhUJ
67 | journals/sigmod/SouzaS99,OmYc0wE1j4kJ
68 | conf/vldb/WangC03,nXj_8Y7lmHsJ
69 | journals/sigmod/Baeza-YatesN96,cvchvz9-m_oJ
70 | conf/vldb/CosleyLP02,BTalXWt3faUJ
71 | conf/vldb/RothS97,DwwSuaisX5QJ
72 | conf/sigmod/GriffinL95,6QZGeKna5lgJ
73 | conf/sigmod/SrikantA96,tTQpdbZYZGoJ
74 | conf/sigmod/CranorGJSS02,Up9QQITiHzAJ
75 | journals/sigmod/HasanFV96,vIBvTfMLL4UJ
76 | journals/sigmod/BunemanRU97,5qg4BNiroqMJ
77 | journals/vldb/DanYC95,SsOLRJZrmtYJ
78 | journals/tods/CliffordDIJS97,f1wgD54UUKwJ
79 | conf/vldb/CareyD96,g8jnRVyQukAJ
80 | conf/sigmod/NybergBCGL94,fXziEl_Htv8J
81 | journals/vldb/HarrisR96,S8x6zjXc9oAJ
82 | journals/sigmod/HummerLW02,W1IcM8IUwAEJ
83 | journals/sigmod/DarwenD95,EXwe9r79qxEJ
84 | conf/sigmod/HungYK96,bTI28RjBpPwJ
85 | journals/sigmod/TatarinovIMHSDDKMM03,rrSZOoViGqoJ
86 | journals/vldb/LeeSC01,Hz_TU6kUj08J
87 | conf/sigmod/SagonasSW94a,xc8KEoeRT9sJ
88 | conf/vldb/ChakravarthyKAK94,3M_0Kd8NNjgJ
89 | conf/sigmod/LometW98,F2ecYx97F2sJ
90 | conf/vldb/AgrawalS94,cIJQ0qxrkMIJ
91 | journals/vldb/RahmB01,707RPzHAB8YJ
92 | conf/vldb/Hammond96,LDxiVdRU6EEJ
93 | journals/sigmod/SilberschatzSU96,soiN2U4tXykJ
94 | journals/sigmod/PourabbasR00,3y6y5AXu-j0J
95 | journals/vldb/PottingerH01,mpnnRdFUUJQJ
96 | conf/sigmod/MankuRL99,X94gDE70Zn0J
97 | conf/vldb/MeccaCM01,Ph7ZpmdNOPEJ
98 |
--------------------------------------------------------------------------------
/docs/scaling_and_optimization.rst:
--------------------------------------------------------------------------------
1 | Scaling and Optimization
2 | ========================
3 |
4 | One important feature of RLTK is scalability. It can either work with very limited resources or utilize large amounts of resources.
5 |
6 | Set proper arguments
7 | --------------------
8 |
9 | Some methods have optional or required arguments for buffer size, chunk size, queue size, and so on. Setting them appropriately for your machine's specification can eliminate many unnecessary memory-disk swap operations.
10 |
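The choice of adapter matters in the same way: an in-memory adapter is fastest but bounded by RAM, while a disk-backed adapter trades speed for capacity. A sketch using the adapters from the basic examples (file names and record classes are illustrative):

.. code-block:: python

    # small dataset: keep parsed records in RAM
    ds1 = rltk.Dataset(reader=rltk.CSVReader('ds1.csv'),
                       record_class=Record1,
                       adapter=rltk.MemoryKeyValueAdapter())

    # large dataset: spill parsed records to an on-disk DBM index
    ds2 = rltk.Dataset(reader=rltk.JsonLinesReader('ds2.jl'),
                       record_class=Record2,
                       adapter=rltk.DbmKeyValueAdapter('file_index'))
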
11 | Parallel processing
12 | -------------------
13 |
14 | Here you need a package called `pyrallel <https://github.com/usc-isi-i2/pyrallel>`_.
15 |
16 | General parallel processing
17 | ```````````````````````````
18 |
19 | If you have compute-intensive procedures and your machine has more than one CPU core, `pyrallel.ParallelProcessor` is the tool to try. You can find more detailed information in its API documentation, but in general, it encapsulates multiprocessing for parallel computation and multithreading for data collection.
20 |
21 | .. code-block:: python
22 |
23 | result = []
24 |
25 | def heavy_calculation(x, y):
26 | return x * x, y + 5
27 |
28 | def output_handler(r1, r2):
29 | result.append(r1 if r1 > r2 else r2)
30 |
31 | pp = pyrallel.ParallelProcessor(8, mapper=heavy_calculation, collector=output_handler)
32 | pp.start()
33 |
34 | for i in range(8):
35 | pp.add_task(i, i + 1)
36 |
37 | pp.task_done()
38 | pp.join()
39 |
40 | print(result)
41 |
42 |
43 | MapReduce
44 | `````````
45 |
46 | The above solution uses one thread (in the main process) for collecting calculated data. If you want to do something like divide and conquer, especially when the "conquer" step needs heavy calculation, you may need the `pyrallel.MapReduce` module.
47 |
48 | .. code-block:: python
49 |
50 | def mapper(x):
51 | time.sleep(0.0001)
52 | return x
53 |
54 | def reducer(r1, r2):
55 | return r1 + r2
56 |
57 | mr = pyrallel.MapReduce(8, mapper, reducer)
58 | for i in range(10000):
59 | mr.add_task(i)
60 |
61 | mr.task_done()
62 | result = mr.join()
63 | print(result)
64 |
65 | Distributed computing (Experimental)
66 | ------------------------------------
67 |
68 | .. note::
69 |
70 | Running RLTK on a cluster is not necessarily faster than on a single machine; performance depends on the requirements, the data, and the code. If you only have tiny datasets and light tasks, even parallel computing is unnecessary, since creating processes and switching thread contexts both have costs. Similarly, distributed computing adds IO overhead (especially network) and is harder to debug, so use it only when you really need it. Most of the time, refactoring the code can have a bigger boosting effect.
71 |
72 | If you have extremely heavy computation work or very large datasets, and you also have multiple idle machines, you may consider using distributed computing. More detailed usage is in the API documentation: :doc:`mod_remote`.
73 |
74 | First, you need to set up a cluster. A cluster is formed by one scheduler and a group of workers.
75 |
76 | To start a scheduler, run
77 | 
78 | .. code-block:: bash
79 | 
80 | python -m rltk remote.scheduler
81 | 
82 | Then, on each worker machine, run
83 | 
84 | .. code-block:: bash
85 | 
86 | python -m rltk remote.worker <scheduler-ip>:8786 --nprocs <num-of-processes>
87 |
88 | Second, change your code a bit and run it. The API for distributed computing is very similar to `pyrallel.ParallelProcessor`, but you need a `rltk.remote.Remote` object which connects to the scheduler, and an instance of `rltk.remote.Task` which has an input and an output handler.
89 |
90 | .. code-block:: python
91 |
92 | def input_handler(r1, r2):
93 | return r1, r2, is_pair(r1, r2)
94 |
95 | def output_handler(r1, r2, label):
96 | print(r1.id, r2.id, label)
97 |
98 | remote = rltk.remote.Remote('127.0.0.1:8786')
99 | task = rltk.remote.Task(remote, input_handler=input_handler, output_handler=output_handler)
100 | task.start()
101 |
102 | for r1, r2 in rltk.get_record_pairs(ds1, ds2):
103 | task.compute(r1, r2)
104 |
105 | task.task_done()
106 | task.join()
107 |
108 | If data is in a shared data store (file system or service), there's no need to transfer record data through the scheduler to the workers, only record ids. Workers can then get the data directly from the data store. So change your code so that `input_handler` accepts ids as input and fetches the record data itself.
109 |
110 | .. code-block:: python
111 | :emphasize-lines: 1,2,9
112 |
113 | def input_handler(id1, id2):
114 | r1, r2 = ds1.get(id1), ds2.get(id2)
115 | return is_pair(r1, r2)
116 |
117 | task = rltk.remote.Task(remote, input_handler=input_handler, output_handler=output_handler)
118 | task.start()
119 |
120 | for r1, r2 in rltk.get_record_pairs(ds1, ds2):
121 | task.compute(r1.id, r2.id)
122 |
123 | task.task_done()
124 | task.join()
125 |
--------------------------------------------------------------------------------
/examples/basic/auto_record.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import rltk
3 |
4 | print('from dataframe...')
5 |
6 | df = pd.read_csv('ds1.csv', encoding='latin-1')
7 | df['id'] = df['doc_id'].astype('str')
8 |
9 |
10 | class DFRecord(rltk.AutoGeneratedRecord):
11 | pass
12 |
13 |
14 | ds = rltk.Dataset(rltk.DataFrameReader(df), record_class=DFRecord)
15 | for r in ds:
16 | print(r.id, r.doc_id, r.doc_value)
17 |
18 |
19 | print('set id column...')
20 |
21 |
22 | @rltk.set_id('col1', function_=lambda x: str(x), keep_original=True)
23 | class DFRecord2(rltk.AutoGeneratedRecord):
24 | pass
25 |
26 |
27 | df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
28 | ds = rltk.Dataset(reader=rltk.DataFrameReader(df), record_class=DFRecord2)
29 | for r in ds:
30 | print(r.id, r.col1, r.col2)
31 |
--------------------------------------------------------------------------------
/examples/basic/basic.py:
--------------------------------------------------------------------------------
1 | import rltk
2 |
3 |
4 | class Record1(rltk.Record):
5 | @property
6 | def id(self):
7 | return self.raw_object['doc_id']
8 |
9 | @property
10 | def value(self):
11 | return self.raw_object['doc_value']
12 |
13 | @property
14 | def parent_id(self):
15 | return '4' if self.id == '1' else None
16 |
17 |
18 | class Record2(rltk.Record):
19 | @rltk.cached_property
20 | def id(self):
21 | return self.raw_object['ident']
22 |
23 | @rltk.cached_property
24 | def value(self):
25 | v = self.raw_object.get('values', list())
26 | return v[0] if len(v) > 0 else 'empty'
27 |
28 | ds1 = rltk.Dataset(reader=rltk.CSVReader('ds1.csv'),
29 | record_class=Record1, adapter=rltk.MemoryKeyValueAdapter())
30 | ds2 = rltk.Dataset(reader=rltk.JsonLinesReader('ds2.jl'),
31 | record_class=Record2, adapter=rltk.DbmKeyValueAdapter('file_index'))
32 |
33 | pairs = rltk.get_record_pairs(ds1, ds2)
34 | for r1, r2 in pairs:
35 | print('-------------')
36 | print(r1.id, r1.value, '\t', r2.id, r2.value)
37 | if r1.parent_id:
38 | print('r1\'s parent', r1.parent_id, ds1.get_record(r1.parent_id).value)
39 | print('levenshtein_distance:', rltk.levenshtein_distance(r1.value, r2.value))
40 | print('levenshtein_similarity:', rltk.levenshtein_similarity(r1.value, r2.value))
41 |
--------------------------------------------------------------------------------
/examples/basic/cached_record.py:
--------------------------------------------------------------------------------
1 | import rltk
2 |
3 |
4 | @rltk.remove_raw_object
5 | class Record1(rltk.Record):
6 |
7 | @rltk.cached_property
8 | def id(self):
9 | print('--> compute id:', self.raw_object['doc_id'])
10 | return self.raw_object['doc_id']
11 |
12 | @rltk.cached_property
13 | def value(self):
14 | print('--> compute value:', self.raw_object['doc_value'])
15 | return self.raw_object['doc_value']
16 |
17 | @property
18 | def id_and_value(self):
19 | print('--> compute id_and_value')
20 | return self.id + '-' + self.value
21 |
22 |
23 | arr = [
24 | {'doc_id': '1', 'doc_value': 'a'},
25 | {'doc_id': '2', 'doc_value': 'b'},
26 | {'doc_id': '3', 'doc_value': 'c'}
27 | ]
28 | # adapter = rltk.RedisKeyValueAdapter(host='127.0.0.1', key_prefix='cached_')
29 | adapter = rltk.HBaseKeyValueAdapter(host='127.0.0.1', key_prefix='test_', table='rltk_test1')
30 | ds1 = rltk.Dataset(reader=rltk.ArrayReader(arr), record_class=Record1, adapter=adapter)
31 | for r1 in ds1:
32 | print('------------')
33 | print('id:', r1.id)
34 | print('value:', r1.value)
35 | print('id_and_value:', r1.id_and_value)
36 | print('cache in dict:', r1.__dict__)
37 |
--------------------------------------------------------------------------------
/examples/basic/dedup.py:
--------------------------------------------------------------------------------
1 | import rltk
2 |
3 |
4 | raw_inputs = [
5 | {'name': 'a1', 'age': 10, 'id': 1},
6 | {'name': 'a2', 'age': 20, 'id': 2},
7 | {'name': 'a3', 'age': 30, 'id': 3},
8 | {'name': 'a3', 'age': 30, 'id': 4},
9 | {'name': 'a1', 'age': 10, 'id': 5},
10 | ]
11 |
12 |
13 | class MyRecord(rltk.Record):
14 |
15 | @property
16 | def id(self):
17 | return str(self.raw_object['id'])
18 |
19 | @property
20 | def name(self):
21 | return self.raw_object['name']
22 |
23 | @property
24 | def age(self):
25 | return self.raw_object['age']
26 |
27 |
28 | ds = rltk.Dataset(reader=rltk.ArrayReader(raw_inputs), record_class=MyRecord)
29 | for r, r_ in rltk.get_record_pairs(ds):
30 | print('comparing', r.id, r_.id, r.name == r_.name and r.age == r_.age)
31 |
--------------------------------------------------------------------------------
/examples/basic/ds1.csv:
--------------------------------------------------------------------------------
1 | doc_id,doc_value
2 | 1,hello
3 | 2,world
4 | 3,foo
5 | 4,bar
--------------------------------------------------------------------------------
/examples/basic/ds2.jl:
--------------------------------------------------------------------------------
1 | {"ident": "a", "values":["a1"]}
2 | {"ident": "b", "values":["b1", "b2"]}
3 | {"ident": "c", "values":["c1"]}
4 |
5 | {"ident": "d"}
6 |
--------------------------------------------------------------------------------
/examples/blocking/block_io_operations.py:
--------------------------------------------------------------------------------
1 | import rltk
2 |
3 | b1 = rltk.Block()
4 | b1.add('001', '1', '1')
5 | b1.add('001', '2', 'a')
6 | b1.add('002', '1', '2')
7 | b1.add('002', '2', 'b')
8 | b1.add('002', '2', 'c')
9 | print('--- block1 ---')
10 | for bb in b1:
11 | print(bb)
12 |
13 | b2 = rltk.Block()
14 | b2.add('001', '1', '1')
15 | b2.add('001', '2', 'a')
16 | b2.add('001', '2', 'd')
17 | b2.add('002', '1', '1')
18 | b2.add('002', '2', 'c')
19 | b2.add('002', '3', 'k')
20 | print('--- block2 (pairwise) ---')
21 | for bb in b2.pairwise('1', '2'):
22 | print(bb)
23 | print('--- block2 (pairwise, single dataset) ---')
24 | for bb in b2.pairwise('2'):
25 | print(bb)
26 |
27 | b1_inverted = rltk.BlockingHelper.generate_inverted_indices(b1)
28 | b2_inverted = rltk.BlockingHelper.generate_inverted_indices(b2)
29 | b3 = rltk.BlockingHelper.union(b1, b1_inverted, b2, b2_inverted)
30 | print('--- union ---')
31 | for bb in b3:
32 | print(bb)
33 | print('--- union raw ---')
34 | for rr in b3.key_set_adapter:
35 | print(rr)
36 |
37 | b4 = rltk.BlockingHelper.intersect(b1, b1_inverted, b2, b2_inverted)
38 | print('--- intersect --')
39 | for bb in b4:
40 | print(bb)
41 | print('--- intersect raw --')
42 | for rr in b4.key_set_adapter:
43 | print(rr)
44 |
--------------------------------------------------------------------------------
/examples/blocking/canopy.py:
--------------------------------------------------------------------------------
1 | import rltk
2 | import math
3 |
4 |
5 | @rltk.remove_raw_object
6 | class Record1(rltk.Record):
7 |
8 | @rltk.cached_property
9 | def id(self):
10 | return self.raw_object['doc_id']
11 |
12 | @rltk.cached_property
13 | def first_name(self):
14 | return self.raw_object['first name']
15 |
16 | @rltk.cached_property
17 | def last_name(self):
18 | return self.raw_object['last name']
19 |
20 | @property
21 | def full_name(self):
22 | return self.first_name + ' ' + self.last_name
23 |
24 |
25 | @rltk.remove_raw_object
26 | class Record2(rltk.Record):
27 |
28 | @rltk.cached_property
29 | def id(self):
30 | return self.raw_object['ident']
31 |
32 | @rltk.cached_property
33 | def first_name(self):
34 | return self.raw_object['name'].split(' ')[0]
35 |
36 | @rltk.cached_property
37 | def last_name(self):
38 | return self.raw_object['name'].split(' ')[1]
39 |
40 | @property
41 | def full_name(self):
42 | return self.first_name + ' ' + self.last_name
43 |
44 | ds1 = rltk.Dataset(reader=rltk.CSVReader('ds1.csv', delimiter=','), record_class=Record1)
45 | ds2 = rltk.Dataset(reader=rltk.JsonLinesReader('ds2.jl'), record_class=Record2)
46 |
47 | # for r in ds1:
48 | # print(r.first_name)
49 | # for r in ds2:
50 | # print(r.first_name)
51 |
52 |
53 | def vectorize(r):
54 | return [ord(r.first_name[0].lower()) - 0x61, 2]
55 |
56 |
57 | def distance_metric(vec1, vec2):
58 | vec1, vec2 = float(vec1[0]), float(vec2[0])
59 | return math.sqrt((vec1 - vec2) ** 2)
60 |
61 | bg = rltk.CanopyBlockGenerator(t1=10, t2=5, distance_metric=distance_metric)
62 | block = bg.generate(
63 | bg.block(ds1, function_=vectorize),
64 | bg.block(ds2, function_=vectorize))
65 | pairs = rltk.get_record_pairs(ds1, ds2, block=block)
66 | for r1, r2 in pairs:
67 | print(r1.id, r1.full_name, '\t', r2.id, r2.full_name)
68 |
--------------------------------------------------------------------------------
/examples/blocking/ds1.csv:
--------------------------------------------------------------------------------
1 | doc_id,first name,last name
2 | 1,alice, A
3 | 2,bob, A
4 | 3,craig, B
5 | 4,david, C
--------------------------------------------------------------------------------
/examples/blocking/ds2.jl:
--------------------------------------------------------------------------------
1 | {"ident": "a", "name":"alien wong"}
2 | {"ident": "b", "name":"bob lee"}
3 | {"ident": "c", "name":"deck knight"}
4 | {"ident": "d", "name":"joe martin"}
5 |
--------------------------------------------------------------------------------
/examples/blocking/generate_blocks.py:
--------------------------------------------------------------------------------
1 | import rltk
2 |
3 |
4 | @rltk.remove_raw_object
5 | class Record1(rltk.Record):
6 |
7 | @rltk.cached_property
8 | def id(self):
9 | return self.raw_object['doc_id']
10 |
11 | @rltk.cached_property
12 | def first_name(self):
13 | return self.raw_object['first name']
14 |
15 | @rltk.cached_property
16 | def last_name(self):
17 | return self.raw_object['last name']
18 |
19 |
20 | @rltk.remove_raw_object
21 | class Record2(rltk.Record):
22 |
23 | @rltk.cached_property
24 | def id(self):
25 | return self.raw_object['ident']
26 |
27 | @rltk.cached_property
28 | def first_name(self):
29 | return self.raw_object['name'].split(' ')[0]
30 |
31 | @rltk.cached_property
32 | def last_name(self):
33 | return self.raw_object['name'].split(' ')[1]
34 |
35 |
36 | ds1 = rltk.Dataset(reader=rltk.CSVReader('ds1.csv', delimiter=','),
37 | record_class=Record1, adapter=rltk.MemoryKeyValueAdapter())
38 | ds2 = rltk.Dataset(reader=rltk.JsonLinesReader('ds2.jl'),
39 | record_class=Record2, adapter=rltk.MemoryKeyValueAdapter())
40 |
41 | print('--- block on first_name ---')
42 | bg = rltk.HashBlockGenerator()
43 | block = bg.generate(bg.block(ds1, property_='first_name'),
44 | bg.block(ds2, property_='first_name'))
45 |
46 | pairs = rltk.get_record_pairs(ds1, ds2, block=block)
47 | for r1, r2 in pairs:
48 | print(r1.id, r1.first_name, '\t', r2.id, r2.first_name)
49 |
50 |
51 | print('--- block on first_name[:1] ---')
52 | bg2 = rltk.HashBlockGenerator()
53 | block2 = bg2.generate(
54 | bg2.block(ds1, function_=lambda r: r.first_name[:1]),
55 | bg2.block(ds2, function_=lambda r: r.first_name[:1]))
56 |
57 | pairs = rltk.get_record_pairs(ds1, ds2, block=block2)
58 | for r1, r2 in pairs:
59 | print(r1.id, r1.first_name, '\t', r2.id, r2.first_name)
60 |
61 |
62 | print('--- block on first_name[:3] based on previous blocks ---')
63 | bg3 = rltk.HashBlockGenerator()
64 | block3 = bg3.generate(
65 | bg3.block(ds1, function_=lambda r: r.first_name[:3], base_on=block2),
66 | bg3.block(ds2, function_=lambda r: r.first_name[:3], base_on=block2))
67 | pairs = rltk.get_record_pairs(ds1, ds2, block=block3)
68 | for r1, r2 in pairs:
69 | print(r1.id, r1.first_name, '\t', r2.id, r2.first_name)
70 | print('inside blocks:')
71 | for b, d, r in block3:
72 | print(b, d, r)
73 |
--------------------------------------------------------------------------------
/examples/blocking/inverted_index.py:
--------------------------------------------------------------------------------
1 | import rltk
2 |
3 |
4 | @rltk.remove_raw_object
5 | class Record1(rltk.Record):
6 |
7 | @rltk.cached_property
8 | def id(self):
9 | return self.raw_object['doc_id']
10 |
11 | @rltk.cached_property
12 | def first_name(self):
13 | return self.raw_object['first name']
14 |
15 | @rltk.cached_property
16 | def last_name(self):
17 | return self.raw_object['last name']
18 |
19 | @property
20 | def full_name(self):
21 | return self.first_name + ' ' + self.last_name
22 |
23 |
24 | @rltk.remove_raw_object
25 | class Record2(rltk.Record):
26 |
27 | @rltk.cached_property
28 | def id(self):
29 | return self.raw_object['ident']
30 |
31 | @rltk.cached_property
32 | def first_name(self):
33 | return self.raw_object['name'].split(' ')[0]
34 |
35 | @rltk.cached_property
36 | def last_name(self):
37 | return self.raw_object['name'].split(' ')[1]
38 |
39 | @property
40 | def full_name(self):
41 | return self.first_name + ' ' + self.last_name
42 |
43 | ds1 = rltk.Dataset(reader=rltk.CSVReader('ds1.csv', delimiter=','), record_class=Record1)
44 | ds2 = rltk.Dataset(reader=rltk.JsonLinesReader('ds2.jl'), record_class=Record2)
45 |
46 | ngram = rltk.NGramTokenizer()
47 |
48 | bg = rltk.TokenBlockGenerator()
49 | block1 = bg.block(ds1, function_=lambda r: ngram.basic(r.first_name, 3),
50 | block=rltk.Block(rltk.LevelDbKeySetAdapter('block_store', 'b1', clean=True)))
51 | block2 = bg.block(ds2, function_=lambda r: ngram.basic(r.first_name, 3),
52 | block=rltk.Block(rltk.LevelDbKeySetAdapter('block_store', 'b2', clean=True)))
53 | block3 = bg.generate(block1, block2, rltk.Block(rltk.LevelDbKeySetAdapter('block_store', 'b3', clean=True)))
54 | pairs = rltk.get_record_pairs(ds1, ds2, block=block3)
55 | for r1, r2 in pairs:
56 | print(r1.id, r1.full_name, '\t', r2.id, r2.full_name)
57 |
--------------------------------------------------------------------------------
/examples/evaluation/.gitignore:
--------------------------------------------------------------------------------
1 | saved_ground_truth.csv
--------------------------------------------------------------------------------
/examples/evaluation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/usc-isi-i2/rltk/aee10ed5dd561583e60db3373ed82fe1208da1e9/examples/evaluation/__init__.py
--------------------------------------------------------------------------------
/examples/evaluation/construct_datasets.py:
--------------------------------------------------------------------------------
1 | import rltk
2 |
3 |
4 | @rltk.remove_raw_object
5 | class EvaluationRecord(rltk.Record):
6 | @rltk.cached_property
7 | def id(self):
8 | return self.raw_object['id']
9 |
10 | @rltk.cached_property
11 | def name(self):
12 | return self.raw_object['name']
13 |
14 | @rltk.cached_property
15 | def laptop(self):
16 | return self.raw_object['laptop_brand']
17 |
18 |
19 | @rltk.remove_raw_object
20 | class EvaluationRecord2(rltk.Record):
21 | @rltk.cached_property
22 | def id(self):
23 | return self.raw_object['id']
24 |
25 | @rltk.cached_property
26 | def name(self):
27 | return self.raw_object['name']
28 |
29 | @rltk.cached_property
30 | def laptop(self):
31 | return self.raw_object['laptop']
32 |
33 |
34 | dataset_1_file_name = 'data_1.csv'
35 | dataset_2_file_name = 'data_2.csv'
36 |
37 | ds1 = rltk.Dataset(reader=rltk.CSVReader(dataset_1_file_name),
38 | record_class=EvaluationRecord)
39 | ds2 = rltk.Dataset(reader=rltk.CSVReader(dataset_2_file_name),
40 | record_class=EvaluationRecord2)
41 |
--------------------------------------------------------------------------------
/examples/evaluation/data_1.csv:
--------------------------------------------------------------------------------
1 | id,name,laptop_brand
2 | 0,Jerry Li,Alienware
3 | 1,Jeremy Yin,Apple
4 | 2,Jack Liu,HP
5 | 3,John Xi,Apple
6 |
--------------------------------------------------------------------------------
/examples/evaluation/data_2.csv:
--------------------------------------------------------------------------------
1 | id,name,laptop
2 | 10,jerry,Alienware 6
3 | 11,jeremy,Macbook Pro
4 | 12,jack,HP Envy X360
5 | 13,john,Macbook Pro
--------------------------------------------------------------------------------
/examples/evaluation/generate_negative_gt.py:
--------------------------------------------------------------------------------
1 | from construct_datasets import *
2 |
3 |
4 | print('generate negatives')
5 | gt = rltk.GroundTruth()
6 | gt.load('gt_positive_only.csv')
7 |
8 |
9 | def score_function(r1, r2):
10 | return rltk.levenshtein_similarity(r1.name, r2.name)
11 |
12 | gt.generate_negatives(ds1, ds2, score_function=score_function)
13 |
14 | for id1, id2, label in gt:
15 | print(id1, id2, label)
16 |
17 |
18 | print('generate all negatives')
19 | gt1 = rltk.GroundTruth()
20 | gt1.load('gt_positive_only.csv')
21 |
22 | gt1.generate_all_negatives(ds1, ds2)
23 | for id1, id2, label in gt1:
24 | print(id1, id2, label)
25 |
26 |
27 | print('generate stratified negatives')
28 | gt2 = rltk.GroundTruth()
29 | gt2.load('gt_positive_only.csv')
30 |
31 |
32 | num_of_cluster = 3
33 | curr = -1
34 |
35 |
36 | def classify(r1, r2):
37 | global curr
38 | curr = (curr + 1) % num_of_cluster
39 | return curr
40 |
41 | gt2.generate_stratified_negatives(ds1, ds2, classify, num_of_cluster)
42 | for id1, id2, label in gt2:
43 | print(id1, id2, label)
--------------------------------------------------------------------------------
/examples/evaluation/ground_truth.csv:
--------------------------------------------------------------------------------
1 | id1,id2,label
2 | 0,10,True
3 | 0,11,False
4 | 0,12,False
5 | 0,13,False
6 | 1,10,False
7 | 1,11,True
8 | 1,12,False
9 | 1,13,False
10 | 2,10,False
11 | 2,11,False
12 | 2,12,True
13 | 2,13,False
14 | 3,10,False
15 | 3,11,False
16 | 3,12,False
17 |
--------------------------------------------------------------------------------
/examples/evaluation/gt_positive_only.csv:
--------------------------------------------------------------------------------
1 | id1,id2,label
2 | 0,11,True
3 | 1,13,True
4 |
--------------------------------------------------------------------------------
/examples/evaluation/run_evaluation.py:
--------------------------------------------------------------------------------
1 | from construct_datasets import *
2 |
3 | saved_ground_truth_file_name = 'ground_truth.csv'
4 | gt = rltk.GroundTruth()
5 | gt.load(saved_ground_truth_file_name)
6 |
7 | gt.add_ground_truth('3', '13', True)
8 | gt.save('saved_' + saved_ground_truth_file_name)
9 |
10 | eva = rltk.Evaluation()
11 |
12 | for min_confidence_100 in range(0, 100):
13 | threshold = min_confidence_100 / 100
14 | trial = rltk.Trial(gt, min_confidence=0, top_k=0,
15 | label='min threshold is: {}'.format(threshold), threshold=threshold)
16 | pairs = rltk.get_record_pairs(ds1, ds2)
17 | for r1, r2 in pairs:
18 | c = 0.3 * rltk.levenshtein_similarity(r1.name, r2.name) + 0.7 * rltk.levenshtein_similarity(r1.laptop, r2.laptop)
19 | p = (c >= threshold)
20 | trial.add_result(r1, r2, p, c)
21 |
22 | trial.evaluate()
23 | eva.add_trial(trial)
24 |
25 | # coord = [
26 | # {
27 | # 'x': 'threshold',
28 | # 'y': 'false_positives',
29 | # 'label': '123'
30 | # },
31 | # {
32 | # 'x': 'threshold',
33 | # 'y': 'true_positives',
34 | # 'label': '456',
35 | # 'linestyle': '--'
36 | # },
37 | # {
38 | # 'x': 'recall',
39 | # 'y': 'precision',
40 | # 'label': 'pr',
41 | # 'linestyle': '--'
42 | # }
43 | # ]
44 | # eva.plot(coord)
45 | eva.plot_precision_recall().show()
46 |
47 | eva.plot_roc().show()
48 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Cython>=0.28.0
2 | numpy>=1.17.0
3 | pandas>=1.2.0
4 | scipy>=1.1.0
5 | matplotlib>=2.0.0
6 | dask>=0.19.2
7 | distributed>=1.23
8 | pyrallel.lib
9 |
--------------------------------------------------------------------------------
/requirements_dev.txt:
--------------------------------------------------------------------------------
1 | redis>=2.0.0
2 | happybase>=1.1.0
3 | plyvel>=1.0.5
4 | pytest
5 | pytest-cov<2.6
6 |
7 |
--------------------------------------------------------------------------------
/requirements_docs.txt:
--------------------------------------------------------------------------------
1 | -r requirements.txt
2 | -r requirements_dev.txt
3 | alabaster>=0.7.9
4 | Sphinx>=1.5.6
5 | sphinx-autobuild>=0.6.0
6 | sphinxcontrib-napoleon>=0.6.0
7 | nbsphinx>=0.3.4
8 | pandoc>=1.0.2
9 | tornado==4.5.3
10 | jupyter>=1.0.0
11 |
--------------------------------------------------------------------------------
/rltk/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = '2.0.0-a020'
2 |
3 | from rltk.record import Record, AutoGeneratedRecord,\
4 | cached_property, generate_record_property_cache, validate_record, remove_raw_object, set_id
5 | from rltk.dataset import Dataset
6 | from rltk.io import *
7 | from rltk.similarity import *
8 | from rltk.blocking import *
9 | from rltk.tokenizer import *
10 | from rltk.evaluation import *
11 | from rltk.utils import candidate_pairs, get_record_pairs
12 | import rltk.cli
13 | import rltk.remote
14 |
--------------------------------------------------------------------------------
/rltk/__main__.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import tempfile
4 | import logging
5 |
6 | from distributed.cli import dask_scheduler, dask_worker
7 |
8 |
9 | def help_info():
10 | print('Available commands:')
11 | print('remote.worker, remote.scheduler')
12 |
13 |
14 | if __name__ == '__main__':
15 | if len(sys.argv) <= 1:
16 | print('No command\n')
17 | help_info()
18 | sys.exit()
19 |
20 | cmd = sys.argv[1]
21 | sub_cmd = sys.argv[2:] if len(sys.argv) >= 3 else []
22 | sys.argv.pop(1)
23 |
24 | if cmd in ('help', '--help', 'h', '-h'):
25 | help_info()
26 | sys.exit()
27 |
28 | sys.argv[0] = cmd # replace prog name
29 | temp_path = os.path.join(tempfile.gettempdir(), 'rltk', 'remote')
30 | if not os.path.exists(temp_path):
31 | os.makedirs(temp_path, exist_ok=True)
32 | if cmd == 'remote.worker':
33 | logger = logging.getLogger('distributed.dask_worker')
34 | logger.setLevel(logging.ERROR)
35 | sys.argv.append('--local-directory')
36 | sys.argv.append(temp_path)
37 | # sys.argv.append('--change-directory')
38 | sys.exit(dask_worker.go())
39 | elif cmd == 'remote.scheduler':
40 | logger = logging.getLogger('distributed.scheduler')
41 | logger.setLevel(logging.ERROR)
42 | sys.argv.append('--local-directory')
43 | sys.argv.append(temp_path)
44 | sys.exit(dask_scheduler.go())
45 | else:
46 | print('Unknown command\n')
47 | help_info()
48 |
49 | sys.exit()
50 |
--------------------------------------------------------------------------------
/rltk/blocking/__init__.py:
--------------------------------------------------------------------------------
1 | from rltk.blocking.block import Block
2 | from rltk.blocking.block_black_list import BlockBlackList
3 | from rltk.blocking.block_generator import BlockGenerator
4 | from rltk.blocking.hash_block_generator import HashBlockGenerator
5 | from rltk.blocking.token_block_generator import TokenBlockGenerator
6 | from rltk.blocking.canopy_block_generator import CanopyBlockGenerator
7 | from rltk.blocking.sorted_neighbourhood_block_generator import SortedNeighbourhoodBlockGenerator
8 | from rltk.blocking.blocking_helper import BlockingHelper
9 |
10 | Blocker = BlockGenerator
11 | HashBlocker = HashBlockGenerator
12 | TokenBlocker = TokenBlockGenerator
13 | CanopyBlocker = CanopyBlockGenerator
14 | SortedNeighbourhoodBlocker = SortedNeighbourhoodBlockGenerator
15 |
--------------------------------------------------------------------------------
/rltk/blocking/block.py:
--------------------------------------------------------------------------------
1 | import itertools
2 |
3 | from rltk.io.adapter.key_set_adapter import KeySetAdapter
4 | from rltk.io.adapter.memory_key_set_adapter import MemoryKeySetAdapter
5 | from rltk.dataset import Dataset
6 | from rltk.record import Record
7 |
8 |
9 | class Block(object):
10 | """
11 | Block
12 |
13 |     Args:
14 |         key_set_adapter (KeySetAdapter, optional): Where the block is stored. If it's None, :class:`MemoryKeySetAdapter` is used. Defaults to None.
15 | """
16 | def __init__(self, key_set_adapter: KeySetAdapter = None):
17 | if not key_set_adapter:
18 | key_set_adapter = MemoryKeySetAdapter()
19 | self.key_set_adapter = key_set_adapter
20 |
21 | def add(self, block_id, dataset_id, record_id):
22 | """
23 | Add to block.
24 |
25 | Args:
26 | block_id (str): Block id.
27 | dataset_id (str / Dataset): Dataset id or Dataset object.
28 | record_id (str / Record): Record id or Record object.
29 | """
30 | if isinstance(dataset_id, Dataset):
31 | dataset_id = dataset_id.id
32 | if isinstance(record_id, Record):
33 | record_id = record_id.id
34 | self.key_set_adapter.add(block_id, (dataset_id, record_id))
35 |
36 | def get(self, block_id):
37 | """
38 | Get block by block_id.
39 |
40 | Args:
41 | block_id (str): Block id.
42 |
43 | Returns:
44 | set: {(dataset_id, record_id)}
45 | """
46 | return self.key_set_adapter.get(block_id)
47 |
48 | def __iter__(self):
49 | """
50 | Same as :meth:`__next__`
51 | """
52 | return self.__next__()
53 |
54 | def __next__(self):
55 | """
56 | Iterator of blocks.
57 |
58 | Returns:
59 | iter: block_id, dataset_id, record_id.
60 | """
61 | for block_id, data in self.key_set_adapter:
62 | for dataset_id, record_id in data:
63 | yield block_id, dataset_id, record_id
64 |
65 | def pairwise(self, ds_id1: str, ds_id2: str = None):
66 | """
67 | Iterator of id pairs generated according to blocks.
68 |
69 | Returns:
70 | iter: block_id, id1, id2.
71 | """
72 | if isinstance(ds_id1, Dataset):
73 | ds_id1 = ds_id1.id
74 | if ds_id2 and isinstance(ds_id2, Dataset):
75 | ds_id2 = ds_id2.id
76 |
77 | if ds_id2:
78 | for block_id, data in self.key_set_adapter:
79 | # fetch one block
80 | ds1, ds2 = list(), list()
81 | for dataset_id, record_id in data:
82 | if dataset_id == ds_id1:
83 | ds1.append(record_id)
84 | elif dataset_id == ds_id2:
85 | ds2.append(record_id)
86 |
87 | # cross product
88 | for id1, id2 in itertools.product(ds1, ds2):
89 | yield block_id, id1, id2
90 | else:
91 | for block_id, data in self.key_set_adapter:
92 | # fetch one block
93 | ds1 = list()
94 | for dataset_id, record_id in data:
95 | if dataset_id == ds_id1:
96 | ds1.append(record_id)
97 |
98 | # combinations of two elements
99 |             for id1, id2 in itertools.combinations(ds1, 2):
100 |                 yield block_id, id1, id2
101 |
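A minimal sketch of how `pairwise` expands a block into candidate pairs, using only the
in-memory default adapter (ids below are illustrative; pair order depends on set iteration):

    import rltk

    b = rltk.Block()  # backed by MemoryKeySetAdapter by default
    b.add('block_a', 'ds1', 'r1')
    b.add('block_a', 'ds1', 'r2')
    b.add('block_a', 'ds2', 'r9')

    # cross-dataset pairs: per-block cross product of ds1 and ds2 members
    print(list(b.pairwise('ds1', 'ds2')))
    # e.g. [('block_a', 'r1', 'r9'), ('block_a', 'r2', 'r9')]

    # single-dataset pairs: per-block 2-combinations of ds1 members
    print(list(b.pairwise('ds1')))
    # e.g. [('block_a', 'r1', 'r2')]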
--------------------------------------------------------------------------------
/rltk/blocking/block_black_list.py:
--------------------------------------------------------------------------------
1 | from rltk.io.adapter.key_set_adapter import KeySetAdapter
2 | from rltk.io.adapter.memory_key_set_adapter import MemoryKeySetAdapter
3 | from rltk.blocking.block import Block
4 |
5 |
6 | class BlockBlackList(object):
7 | """
8 | Block black list
9 |
10 | Args:
11 |         key_set_adapter (KeySetAdapter, optional): Where the black list is stored. If it's None,
12 |             :class:`MemoryKeySetAdapter` is used. Defaults to None.
13 | max_size (int, optional): Maximum size of a block. Used by :meth:`add`. Defaults to 0.
14 | """
15 | def __init__(self, key_set_adapter: KeySetAdapter = None, max_size: int = 0):
16 | if not key_set_adapter:
17 | key_set_adapter = MemoryKeySetAdapter()
18 | self.key_set_adapter = key_set_adapter
19 | self._max_size = max_size
20 |
21 | def has(self, block_id: str):
22 | """
23 | Test if block_id is in black list.
24 |
25 | Args:
26 | block_id (str): Block id.
27 | """
28 | return self.key_set_adapter.get(block_id) is not None
29 |
30 | def add(self, block_id: str, block: Block):
31 | """
32 | Add block_id to black list and update block data.
33 |
34 | Args:
35 | block_id (str): Block id.
36 | block (Block): Block object.
37 |
38 | Notes:
39 |             * If `max_size` is 0, block_id is always added to the black list.
40 |             * If `max_size` is greater than 0 and the block holds more entries than `max_size`,
41 |               block_id is added to the BlockBlackList and the block is removed from the Block.
42 | """
43 | if self._max_size > 0:
44 | d = block.key_set_adapter.get(block_id)
45 | if len(d) > self._max_size:
46 | self.key_set_adapter.set(block_id, set())
47 | block.key_set_adapter.delete(block_id)
48 | else:
49 | self.key_set_adapter.set(block_id, set())
50 |
51 | def __contains__(self, item):
52 | """
53 | Same as :meth:`has`
54 | """
55 |         return self.has(item)
56 |
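A short sketch of the `max_size` behaviour documented in :meth:`add` (names are illustrative):
once a block grows past `max_size`, adding it to the black list also drops it from the block.

    import rltk

    block = rltk.Block()
    black_list = rltk.BlockBlackList(max_size=2)

    for record_id in ('r1', 'r2', 'r3'):
        block.add('huge_block', 'ds1', record_id)
        black_list.add('huge_block', block)  # evicts the block once its size exceeds 2

    print(black_list.has('huge_block'))  # True
    print(block.get('huge_block'))       # None: the oversized block was removed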
--------------------------------------------------------------------------------
/rltk/blocking/block_generator.py:
--------------------------------------------------------------------------------
1 | from typing import Callable, TYPE_CHECKING
2 |
3 | if TYPE_CHECKING:
4 | from rltk.dataset import Dataset
5 | from rltk.blocking.block import Block
6 | from rltk.blocking.block_black_list import BlockBlackList
7 |
8 |
9 | class BlockGenerator(object):
10 | """
11 | Block generator.
12 | """
13 |
14 | def block(self, dataset: 'Dataset', function_: Callable = None, property_: str = None,
15 | block: Block = None, block_black_list: BlockBlackList = None, base_on: Block = None):
16 | """
17 | Block on property or by function for dataset.
18 |
19 | Args:
20 | dataset (Dataset): Dataset.
21 | function_ (Callable): `function_(r: record)`.
22 | property_ (str): The property in Record object.
23 | block (Block): Where to write blocks. If None, a new block will be created. Defaults to None.
24 | block_black_list (BlockBlackList, optional): Where all blacklisted blocks are stored. Defaults to None.
25 |             base_on (Block, optional): The current block is generated based on this block. Defaults to None.
26 |
27 | Returns:
28 | Block:
29 | """
30 | block = BlockGenerator._block_args_check(function_, property_, block)
31 | return block
32 |
33 | @staticmethod
34 | def _block_args_check(function_, property_, block):
35 | if not function_ and not property_:
36 | raise ValueError('Invalid function or property')
37 | return block or Block()
38 |
39 | def generate(self, block1: Block, block2: Block, output_block: Block = None):
40 | """
41 |         Generate a combined block from two blocks, each built on a single dataset.
42 |
43 | Args:
44 | block1 (Block): Block 1.
45 | block2 (Block): Block 2.
46 | output_block (Block): Where the output block goes. If None, a new block will be created. Defaults to None.
47 |
48 | Returns:
49 | Block:
50 | """
51 | block = BlockGenerator._generate_args_check(output_block)
52 | return block
53 |
54 | @staticmethod
55 | def _generate_args_check(block):
56 | return block or Block()
57 |
--------------------------------------------------------------------------------
/rltk/blocking/blocking_helper.py:
--------------------------------------------------------------------------------
1 | import json
2 | import hashlib
3 | import operator
4 |
5 | from rltk.blocking.block import Block
6 | from rltk.io.adapter.key_set_adapter import KeySetAdapter
7 | from rltk.io.adapter.memory_key_set_adapter import MemoryKeySetAdapter
8 |
9 |
10 | class BlockingHelper(object):
11 | """
12 | Blocking Helper.
13 | """
14 |
15 | @staticmethod
16 | def encode_inverted_index_key(dataset_id, record_id):
17 | return json.dumps({'d': dataset_id, 'r': record_id}, sort_keys=True)
18 |
19 | @staticmethod
20 | def decode_inverted_index_key(key):
21 | key = json.loads(key)
22 | return key['d'], key['r']
23 |
24 | @staticmethod
25 | def generate_inverted_indices(block: Block, ks_adapter: KeySetAdapter = None):
26 | """
27 | Generate inverted indices of block.
28 |
29 | Args:
30 | block (Block): Original block.
31 |             ks_adapter (KeySetAdapter): Where the inverted indices are stored. If it's None, :class:`MemoryKeySetAdapter` is used.
32 |
33 | Returns:
34 | KeySetAdapter:
35 | """
36 | if not ks_adapter:
37 | ks_adapter = MemoryKeySetAdapter()
38 | for block_id, dataset_id, record_id in block:
39 | ks_adapter.add(BlockingHelper.encode_inverted_index_key(dataset_id, record_id), block_id)
40 | return ks_adapter
41 |
42 | @staticmethod
43 | def _block_operations(operator_, left_block, right_block, right_inverted, output_block):
44 | operation = None
45 | if operator_ == 'union':
46 | operation = operator.or_ # lambda a, b: a | b
47 | elif operator_ == 'intersect':
48 | operation = operator.and_ # lambda a, b: a & b
49 |
50 | for left_block_id, left_data in left_block.key_set_adapter:
51 | for left_dataset_id, left_record_id in left_data:
52 | key = BlockingHelper.encode_inverted_index_key(left_dataset_id, left_record_id)
53 | right_block_ids = right_inverted.get(key)
54 | if right_block_ids:
55 | for right_block_id in right_block_ids:
56 | new_block_data = operation(left_data, right_block.get(right_block_id))
57 | new_block_id = hashlib \
58 | .sha1(''.join(sorted(['{},{}'.format(ds, r) for ds, r in new_block_data]))
59 | .encode('utf-8')).hexdigest()
60 | output_block.key_set_adapter.set(new_block_id, new_block_data)
61 |
62 | @staticmethod
63 | def union(block1, inverted1, block2, inverted2, block3=None):
64 | """
65 | Union of two blocks.
66 |
67 | Args:
68 | block1 (Block): Block 1.
69 | inverted1 (KeySetAdapter): Inverted indices of block 1.
70 |             block2 (Block): Block 2.
71 | inverted2 (KeySetAdapter): Inverted indices of block 2.
72 | block3 (Block, optional): Unioned block. If None, a Block object will be created. Defaults to None.
73 |
74 | Returns:
75 | Block:
76 | """
77 | block3 = block3 or Block()
78 |
79 | BlockingHelper._block_operations('union', block1, block2, inverted2, block3)
80 | BlockingHelper._block_operations('union', block2, block1, inverted1, block3)
81 | return block3
82 |
83 | @staticmethod
84 | def intersect(block1, inverted1, block2, inverted2, block3=None):
85 | """
86 | Intersection of two blocks.
87 |
88 | Args:
89 | block1 (Block): Block 1.
90 | inverted1 (KeySetAdapter): Inverted indices of block 1.
91 |             block2 (Block): Block 2.
92 | inverted2 (KeySetAdapter): Inverted indices of block 2.
93 | block3 (Block, optional): Intersected block. If None, a Block object will be created. Defaults to None.
94 |
95 | Returns:
96 | Block:
97 | """
98 | block3 = block3 or Block()
99 |
100 | BlockingHelper._block_operations('intersect', block1, block2, inverted2, block3)
101 | BlockingHelper._block_operations('intersect', block2, block1, inverted1, block3)
102 | return block3
103 |
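The inverted-index keys are plain JSON, so a round trip through the two static helpers above
looks like this (a small sketch, no datasets needed):

    from rltk.blocking.blocking_helper import BlockingHelper

    key = BlockingHelper.encode_inverted_index_key('ds1', 'r1')
    print(key)                                            # {"d": "ds1", "r": "r1"}
    print(BlockingHelper.decode_inverted_index_key(key))  # ('ds1', 'r1')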
--------------------------------------------------------------------------------
/rltk/blocking/canopy_block_generator.py:
--------------------------------------------------------------------------------
1 | import json
2 | import random
3 | from typing import Callable
4 |
5 | from rltk.blocking.block_generator import BlockGenerator
6 | from rltk.blocking.block import Block
7 | from rltk.blocking.block_black_list import BlockBlackList
8 |
9 |
10 | class CanopyBlockGenerator(BlockGenerator):
11 | """
12 | Canopy based block generator.
13 |
14 | Args:
15 | t1 (float): The loose distance.
16 | t2 (float): The tight distance.
17 |         distance_metric (Callable): Computes the distance between two vectors returned from :meth:`block`.
18 | The signature is `distance(v1: List, v2: List) -> float`
19 | """
20 | def __init__(self, t1, t2, distance_metric):
21 | if t1 <= t2:
22 | raise ValueError('t1 should be greater than t2')
23 | if t2 <= 0:
24 |             raise ValueError('t1 and t2 should be greater than 0')
25 |
26 | self._t1 = t1
27 | self._t2 = t2
28 | self._distance_metric = distance_metric
29 |
30 | def block(self, dataset, function_: Callable = None, property_: str = None,
31 | block: Block = None, block_black_list: BlockBlackList = None, base_on: Block = None):
32 | """
33 | The return of `property_` or `function_` should be a vector (list).
34 | """
35 | block = super()._block_args_check(function_, property_, block)
36 |
37 | if base_on:
38 | raise Exception('Canopy currently doesn\'t support `base_on`')
39 | # for block_id, dataset_id, record_id in base_on:
40 | # if dataset.id == dataset_id:
41 | # r = dataset.get_record(record_id)
42 | # value = function_(r) if function_ else getattr(r, property_)
43 | # if not isinstance(value, list):
44 | # raise ValueError('Return of the function or property should be a vector (list)')
45 | # value = block_id + '-' + value
46 | # k = self._encode_key(value)
47 | # if block_black_list and block_black_list.has(k):
48 | # continue
49 | # block.add(k, dataset.id, r.id)
50 | # if block_black_list:
51 | # block_black_list.add(k, block)
52 |
53 | else:
54 | for r in dataset:
55 | value = function_(r) if function_ else getattr(r, property_)
56 | if not isinstance(value, list):
57 | raise ValueError('Return of the function or property should be a vector (list)')
58 | k = self._encode_key(value)
59 | if block_black_list and block_black_list.has(k):
60 | continue
61 | block.add(k, dataset.id, r.id)
62 | if block_black_list:
63 | block_black_list.add(k, block)
64 |
65 | return block
66 |
67 | @staticmethod
68 | def _encode_key(obj):
69 | return str(obj)
70 |
71 | @staticmethod
72 | def _decode_key(str_):
73 | return eval(str_)
74 |
75 | def generate(self, block1: Block, block2: Block, output_block: Block = None):
76 | output_block = BlockGenerator._generate_args_check(output_block)
77 | dataset = []
78 | for key, _ in block1.key_set_adapter:
79 | dataset.append(self._decode_key(key))
80 | for key, _ in block2.key_set_adapter:
81 | dataset.append(self._decode_key(key))
82 |
83 | clusters = self._run_canopy_clustering(dataset, self._t1, self._t2, self._distance_metric)
84 |
85 | for cid, c in enumerate(clusters):
86 | for vec in c:
87 | key = self._encode_key(vec)
88 | set_ = block1.get(key)
89 | if set_:
90 | for ds_id, rid in set_:
91 | output_block.add(cid, ds_id, rid)
92 | set_ = block2.get(key)
93 | if set_:
94 | for ds_id, rid in set_:
95 | output_block.add(cid, ds_id, rid)
96 | return output_block
97 |
98 | @staticmethod
99 | def _run_canopy_clustering(dataset, t1, t2, distance_metric):
100 | """
101 | The algorithm proceeds as follows, using two thresholds t1 (the loose distance) and t2 (the tight distance),
102 | where t1 > t2.
103 |
104 | 1. Begin with the set of data points to be clustered.
105 | 2. Remove a point from the set, beginning a new 'canopy' containing this point.
106 | 3. For each point left in the set, assign it to the new canopy \
107 | if its distance to the first point of the canopy is less than the loose distance t1.
108 | 4. If the distance of the point is additionally less than the tight distance t2,
109 | remove it from the original set.
110 | 5. Repeat from step 2 until there are no more data points in the set to cluster.
111 | """
112 | canopies = []
113 | while len(dataset) > 0:
114 | center_idx = random.randint(0, len(dataset) - 1)
115 | center_vec = dataset[center_idx]
116 | new_canopy = []
117 | delete_list = []
118 | del dataset[center_idx]
119 |
120 | for d_idx in range(len(dataset)):
121 | d = dataset[d_idx]
122 | distance = distance_metric(center_vec, d)
123 | if distance < t1:
124 | new_canopy.append(d)
125 | if distance < t2:
126 | delete_list.append(d_idx)
127 |
128 | # delete vector from dataset from backward
129 | for d_idx in sorted(delete_list, reverse=True):
130 | del dataset[d_idx]
131 | new_canopy.append(center_vec) # add center
132 | canopies.append(new_canopy)
133 | return canopies
134 |
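To make the t1/t2 semantics of `_run_canopy_clustering` concrete, here is a standalone
restatement of the documented loop on one-dimensional points (an independent sketch, not
part of the RLTK API):

    import random

    def canopy_clustering(points, t1, t2, dist):
        canopies = []
        while points:
            # pick a random center and start a new canopy with it
            center = points.pop(random.randrange(len(points)))
            canopy = [center]
            survivors = []
            for p in points:
                d = dist(center, p)
                if d < t1:
                    canopy.append(p)     # within the loose distance: joins the canopy
                if d >= t2:
                    survivors.append(p)  # outside the tight distance: stays for later rounds
            points = survivors
            canopies.append(canopy)
        return canopies

    print(canopy_clustering([1.0, 2.0, 10.0, 11.0], t1=5.0, t2=2.0,
                            dist=lambda a, b: abs(a - b)))
    # e.g. [[10.0, 11.0], [1.0, 2.0]] (composition depends on the random centers)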
--------------------------------------------------------------------------------
/rltk/blocking/hash_block_generator.py:
--------------------------------------------------------------------------------
1 | from typing import Callable, TYPE_CHECKING
2 |
3 | if TYPE_CHECKING:
4 | from rltk.dataset import Dataset
5 | from rltk.blocking.block_generator import BlockGenerator
6 | from rltk.blocking.block import Block
7 | from rltk.blocking.block_black_list import BlockBlackList
8 |
9 |
10 | class HashBlockGenerator(BlockGenerator):
11 | """
12 | Hash block generator.
13 | """
14 |
15 | def block(self, dataset, function_: Callable = None, property_: str = None,
16 | block: Block = None, block_black_list: BlockBlackList = None, base_on: Block = None):
17 | """
18 |         The return of `property_` or `function_` should be a string.
19 | """
20 | block = super()._block_args_check(function_, property_, block)
21 |
22 | if base_on:
23 | for block_id, dataset_id, record_id in base_on:
24 | if dataset.id == dataset_id:
25 | r = dataset.get_record(record_id)
26 | value = function_(r) if function_ else getattr(r, property_)
27 | if not isinstance(value, str):
28 | raise ValueError('Return of the function or property should be a string')
29 | value = block_id + '-' + value
30 | if block_black_list and block_black_list.has(value):
31 | continue
32 | block.add(value, dataset.id, r.id)
33 | if block_black_list:
34 | block_black_list.add(value, block)
35 |
36 | else:
37 | for r in dataset:
38 | value = function_(r) if function_ else getattr(r, property_)
39 | if not isinstance(value, str):
40 | raise ValueError('Return of the function or property should be a string')
41 | if block_black_list and block_black_list.has(value):
42 | continue
43 | block.add(value, dataset.id, r.id)
44 | if block_black_list:
45 | block_black_list.add(value, block)
46 |
47 | return block
48 |
49 | def generate(self, block1: Block, block2: Block, output_block: Block = None):
50 | output_block = super()._generate_args_check(output_block)
51 | for block_id, ds_id, record_id in block1:
52 | output_block.add(block_id, ds_id, record_id)
53 | for block_id, ds_id, record_id in block2:
54 | output_block.add(block_id, ds_id, record_id)
55 | return output_block
56 |
--------------------------------------------------------------------------------
/rltk/blocking/sorted_neighbourhood_block_generator.py:
--------------------------------------------------------------------------------
1 | from typing import Callable
2 | from functools import cmp_to_key
3 |
4 | from rltk.blocking.block_generator import BlockGenerator
5 | from rltk.blocking.block import Block
6 | from rltk.blocking.block_black_list import BlockBlackList
7 |
8 |
9 | class SortedNeighbourhoodBlockGenerator(BlockGenerator):
10 | """
11 | Sorted Neighbourhood Blocker.
12 |
13 | Args:
14 | window_size (int, optional): Window size. Defaults to 3.
15 | comparator (Callable, optional): Define how to compare two tokens t1 and t2.
16 | The signature is `comparator(t1: str, t2: str) -> int`.
17 | If return is 0, t1 equals t2; if return is -1, t1 is less than t2;
18 | if return is 1, t1 is greater than t2.
19 | Defaults to None, which uses Python's default string comparison.
20 | block_id_prefix (str, optional): The block id prefix of each block.
21 | Defaults to "sorted_neighbourhood_".
22 | """
23 | def __init__(self, window_size: int = 3, comparator: Callable = None, block_id_prefix='sorted_neighbourhood_'):
24 | if comparator is None:
25 | comparator = self._default_comparator
26 | self.window_size = window_size
27 | self.comparator = comparator
28 | self.block_id_prefix = block_id_prefix
29 |
30 | def block(self, dataset, function_: Callable = None, property_: str = None,
31 | block: Block = None, block_black_list: BlockBlackList = None, base_on: Block = None):
32 | """
33 | The return of `property_` or `function_` should be a vector (list).
34 | """
35 | block = super()._block_args_check(function_, property_, block)
36 |
37 | if base_on:
38 | for block_id, dataset_id, record_id in base_on:
39 | if dataset.id == dataset_id:
40 | r = dataset.get_record(record_id)
41 | value = function_(r) if function_ else getattr(r, property_)
42 | if not isinstance(value, (list, set)):
43 |                         value = set(value)
44 | for v in value:
45 | if not isinstance(v, str):
46 | raise ValueError('Elements in return list should be string')
47 | if block_black_list and block_black_list.has(v):
48 | continue
49 | v = block_id + '-' + v
50 | block.add(v, dataset.id, r.id)
51 | if block_black_list:
52 | block_black_list.add(v, block)
53 |
54 | else:
55 | for r in dataset:
56 | value = function_(r) if function_ else getattr(r, property_)
57 | if not isinstance(value, (list, set)):
58 | value = set(value)
59 | for v in value:
60 | if not isinstance(v, str):
61 | raise ValueError('Elements in return list should be string')
62 | if block_black_list and block_black_list.has(v):
63 | continue
64 | block.add(v, dataset.id, r.id)
65 | if block_black_list:
66 | block_black_list.add(v, block)
67 |
68 | return block
69 |
70 | def generate(self, block1: Block, block2: Block, output_block: Block = None):
71 | output_block = BlockGenerator._generate_args_check(output_block)
72 |
73 | # TODO: in-memory operations here, need to update
74 | # concatenation
75 | all_records = []
76 | for block_id, ds_id, record_id in block1:
77 | all_records.append((block_id, ds_id, record_id))
78 | for block_id, ds_id, record_id in block2:
79 | all_records.append((block_id, ds_id, record_id))
80 | sorted_all_records = sorted(all_records, key=cmp_to_key(self._comparator_wrapper))
81 |
82 | # apply slide window
83 | for i in range(len(sorted_all_records) - self.window_size + 1):
84 | block_id = self.block_id_prefix + str(i)
85 | for j in range(self.window_size):
86 | record = sorted_all_records[i + j]
87 | output_block.add(block_id, record[1], record[2])
88 |
89 | return output_block
90 |
91 | def _comparator_wrapper(self, t1, t2):
92 | return self.comparator(t1[0], t2[0])
93 |
94 | @staticmethod
95 | def _default_comparator(t1, t2):
96 | return 0 if t1 == t2 else (1 if t1 > t2 else -1)
97 |
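There is no sorted-neighbourhood example under examples/blocking, so a hedged usage sketch
might look like the following, assuming `ds1` and `ds2` are built exactly as in
examples/blocking/generate_blocks.py:

    import rltk

    # ds1, ds2: Datasets constructed as in examples/blocking/generate_blocks.py
    bg = rltk.SortedNeighbourhoodBlockGenerator(window_size=3)
    block = bg.generate(
        bg.block(ds1, function_=lambda r: [r.first_name]),  # sort-key token(s) per record
        bg.block(ds2, function_=lambda r: [r.first_name]))
    for r1, r2 in rltk.get_record_pairs(ds1, ds2, block=block):
        print(r1.id, r1.first_name, '\t', r2.id, r2.first_name)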
--------------------------------------------------------------------------------
/rltk/blocking/token_block_generator.py:
--------------------------------------------------------------------------------
1 | from typing import Callable, TYPE_CHECKING
2 |
3 | if TYPE_CHECKING:
4 | from rltk.dataset import Dataset
5 | from rltk.blocking.block_generator import BlockGenerator
6 | from rltk.blocking.block import Block
7 | from rltk.blocking.block_black_list import BlockBlackList
8 |
9 |
10 | class TokenBlockGenerator(BlockGenerator):
11 | """
12 | Token block generator. The return for :meth:`block` should be a `list` or `set`.
13 | """
14 |
15 | def block(self, dataset, function_: Callable = None, property_: str = None,
16 | block: Block = None, block_black_list: BlockBlackList = None, base_on: Block = None):
17 | """
18 |         The return of `property_` or `function_` should be a list or set.
19 | """
20 | block = super()._block_args_check(function_, property_, block)
21 |
22 | if base_on:
23 | for block_id, dataset_id, record_id in base_on:
24 | if dataset.id == dataset_id:
25 | r = dataset.get_record(record_id)
26 | value = function_(r) if function_ else getattr(r, property_)
27 |                     if not isinstance(value, (list, set)):
28 |                         raise ValueError('Return of the function or property should be a list or set')
29 | for v in value:
30 | if not isinstance(v, str):
31 | raise ValueError('Elements in return list should be string')
32 | if block_black_list and block_black_list.has(v):
33 | continue
34 | v = block_id + '-' + v
35 | block.add(v, dataset.id, r.id)
36 | if block_black_list:
37 | block_black_list.add(v, block)
38 |
39 | else:
40 | for r in dataset:
41 | value = function_(r) if function_ else getattr(r, property_)
42 |                 if not isinstance(value, (list, set)):
43 |                     raise ValueError('Return of the function or property should be a list or set')
44 | for v in value:
45 | if not isinstance(v, str):
46 | raise ValueError('Elements in return list should be string')
47 | if block_black_list and block_black_list.has(v):
48 | continue
49 | block.add(v, dataset.id, r.id)
50 | if block_black_list:
51 | block_black_list.add(v, block)
52 |
53 | return block
54 |
55 | def generate(self, block1: Block, block2: Block, output_block: Block = None):
56 | output_block = super()._generate_args_check(output_block)
57 | for block_id, ds_id, record_id in block1:
58 | output_block.add(block_id, ds_id, record_id)
59 | for block_id, ds_id, record_id in block2:
60 | output_block.add(block_id, ds_id, record_id)
61 | return output_block
62 |
--------------------------------------------------------------------------------
/rltk/cli.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
4 | #: Accept all default values without asking user
5 | SLIENTLY_ACCEPT_ALL_DEFAULT_VALUES = False
6 |
7 |
8 | def prompt(text: str, *args, new_line: bool = True, **kwargs):
9 | """
10 | Prompt in terminal (stdout).
11 |
12 | Args:
13 | text (str): Text.
14 | *args: More text.
15 | new_line (bool, optional): End with a new line. Defaults to True.
16 |         **kwargs: Other keyword arguments used by :py:func:`print`.
17 | """
18 | line_end = '\n' if new_line else ''
19 | print(text, *args, file=sys.stdout, end=line_end, **kwargs)
20 | sys.stdout.flush()
21 |
22 |
23 | def select(text: str, cases: list, default: int = None, case_sensitive: bool = False):
24 | """
25 | Let user select one of the cases.
26 |
27 | Args:
28 | text (str): Text.
29 |         cases (list[tuple]): Cases, should be a list of tuples. Each tuple is in the form `('display text', 'user input')`.
30 |             For example, `[('(Y)es', 'y'), ('(N)o', 'n')]`.
31 |         default (int, optional): Index of the default case in `cases`. Empty input or whitespace is treated as the default case.
32 |             None means no default case. Defaults to None.
33 |         case_sensitive (bool, optional): Whether user's input is case sensitive. Defaults to False.
34 |
35 | Returns:
36 | str: User's input.
37 | """
38 | prompt(text)
39 | case_text = []
40 | for idx, c in enumerate(cases):
41 | if default is not None and idx == default:
42 | case_text.append('[{}]'.format(c[0]))
43 | else:
44 | case_text.append('{}'.format(c[0]))
45 | prompt(' / '.join(case_text))
46 | valid_cases = [c[1] for c in cases]
47 | if default is not None:
48 | valid_cases.append('')
49 | if not case_sensitive:
50 | valid_cases = list(map(lambda x: x.lower(), valid_cases))
51 |
52 | while True:
53 | user_input = ''
54 | if not SLIENTLY_ACCEPT_ALL_DEFAULT_VALUES or default is None:
55 | user_input = input().strip()
56 | if not case_sensitive:
57 | user_input = user_input.lower()
58 | if user_input not in valid_cases:
59 | prompt('Invalid input, please retry')
60 | continue
61 |
62 | if user_input == '' and default is not None:
63 | return cases[default][1]
64 | return user_input
65 |
66 |
67 | def confirm(text: str, default: bool = None):
68 | """
69 | Let user choose Yes or No.
70 |
71 | Args:
72 | text (str): Text.
73 | default (bool, optional): True sets Yes as default case, False sets No. None means no default case.
74 | Defaults to None.
75 |
76 | Returns:
77 | bool: True means Yes, False means No.
78 | """
79 | if default is not None:
80 | default = 0 if default else 1
81 | return select(text, cases=[('(Y)es', 'y',), ('(N)o', 'n')], default=default, case_sensitive=False) == 'y'
82 |
83 |
84 | class Progress(object):
85 | """
86 | Progress status.
87 |
88 | Args:
89 |         format_ (str, optional): Format string used by :meth:`update`.
90 |         start (str, optional): Text shown when starting.
91 |         end (str, optional): Text shown when ending.
92 |
93 | Note:
94 |
95 | Please use in `with` statement::
96 |
97 | with rltk.cli.progress(format_='{}%') as p:
98 | for i in range(11):
99 | time.sleep(0.5)
100 | p.update(i * 10)
101 |
102 | """
103 |
104 | def __init__(self, format_: str = '{}', start: str = 'Starting...', end: str = 'Done!'):
105 | self._format = format_
106 | self._prev_len = 0
107 | self._start = start
108 | self._end = end
109 |
110 | def update(self, *args):
111 | """
112 | Update progress.
113 |
114 | Args:
115 | *args: Arguments which will be formatted by `format_`.
116 | """
117 | text = self._format.format(*args)
118 |
119 | # clean up
120 | prompt('\r' + ' ' * self._prev_len, new_line=False)
121 |
122 | # overwrite
123 | prompt('\r' + text, new_line=False)
124 | self._prev_len = len(text)
125 |
126 | def __enter__(self):
127 | """
128 | Start prompt.
129 | """
130 | if self._start:
131 | prompt(self._start, new_line=False)
132 | return self
133 |
134 | def __exit__(self, exc_type, exc_val, exc_tb):
135 | """
136 | End prompt.
137 | """
138 | # clean up
139 | prompt('\r' + ' ' * self._prev_len, new_line=False)
140 |
141 | if self._end:
142 | prompt('\r' + self._end, new_line=False)
143 |
144 | # new line
145 | prompt('')
146 |
147 |
148 | progress = Progress
149 |
150 |
151 | def input_(text: str, default: str = None, type_: type = str):
152 | """
153 | Input.
154 |
155 | Args:
156 | text (str): Text.
157 | default (str, optional): Default value. Defaults to None which means no default value.
158 | type_ (type, optional): Type of input value, defaults to `str`.
159 |
160 | Returns:
161 | object: User input in type `type_`.
162 |
163 | Note:
164 |         Make sure the default value can be converted by `type_`; otherwise an exception will be raised.
165 | """
166 | prompt(text)
167 |
168 | while True:
169 | if not SLIENTLY_ACCEPT_ALL_DEFAULT_VALUES or default is None:
170 | user_input = input().strip()
171 | try:
172 | return type_(user_input)
173 |             except Exception:
174 | prompt('Invalid input, please retry')
175 | else:
176 | return type_(default)
177 |
178 |
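A small interactive sketch tying the helpers above together (terminal I/O, so the flow
depends on the user's answers):

    import time

    import rltk.cli

    name = rltk.cli.input_('Name of the dataset:', default='ds1')
    if rltk.cli.confirm('Start processing {}?'.format(name), default=True):
        with rltk.cli.progress(format_='{}%') as p:
            for i in range(11):
                time.sleep(0.1)
                p.update(i * 10)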
--------------------------------------------------------------------------------
/rltk/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | from rltk.evaluation.evaluation import Evaluation
2 | from rltk.evaluation.ground_truth import GroundTruth
3 | from rltk.evaluation.trial import Trial
4 |
--------------------------------------------------------------------------------
/rltk/io/__init__.py:
--------------------------------------------------------------------------------
1 | from rltk.io.reader import *
2 | from rltk.io.writer import *
3 | from rltk.io.adapter import *
4 |
--------------------------------------------------------------------------------
/rltk/io/adapter/__init__.py:
--------------------------------------------------------------------------------
1 | from rltk.io.adapter.key_value_adapter import KeyValueAdapter
2 | from rltk.io.adapter.memory_key_value_adapter import MemoryKeyValueAdapter
3 | from rltk.io.adapter.dbm_key_value_adapter import DbmKeyValueAdapter
4 | from rltk.io.adapter.redis_key_value_adapter import RedisKeyValueAdapter
5 | from rltk.io.adapter.hbase_key_value_adapter import HBaseKeyValueAdapter
6 |
7 | from rltk.io.adapter.key_set_adapter import KeySetAdapter
8 | from rltk.io.adapter.memory_key_set_adapter import MemoryKeySetAdapter
9 | from rltk.io.adapter.redis_key_set_adapter import RedisKeySetAdapter
10 | from rltk.io.adapter.leveldb_key_set_adapter import LevelDbKeySetAdapter
11 |
--------------------------------------------------------------------------------
/rltk/io/adapter/dbm_key_value_adapter.py:
--------------------------------------------------------------------------------
1 | import dbm
2 |
3 | from rltk.io.adapter import KeyValueAdapter
4 | from rltk.io.serializer import Serializer, PickleSerializer
5 |
6 |
7 | class DbmKeyValueAdapter(KeyValueAdapter):
8 | """
9 |     Python built-in `DBM <https://docs.python.org/3/library/dbm.html>`_ adapter.
10 |
11 | Args:
12 | filename (str): DBM file name.
13 | dbm_class (dbm): The value can be `dbm.gnu`, `dbm.ndbm` or `dbm.dumb`.
14 | serializer (Serializer, optional): The serializer used to serialize Record object.
15 | If it's None, `PickleSerializer` will be used. Defaults to None.
16 | clean (bool, optional): Clean adapters while starting. Defaults to False.
17 |
18 | Note:
19 |         Performance drops when the dataset is large.
20 | """
21 | def __init__(self, filename, dbm_class=dbm.ndbm, serializer: Serializer = None, clean: bool = False):
22 | if not serializer:
23 | serializer = PickleSerializer()
24 | self._db = dbm_class.open(filename, 'c')
25 | self._serializer = serializer
26 |
27 | if clean:
28 | self.clean()
29 |
30 | def get(self, key):
31 | v = self._db.get(key, None)
32 | if not v:
33 | return
34 | return self._serializer.loads(v)
35 |
36 | def set(self, key, value):
37 | self._db[key] = self._serializer.dumps(value)
38 |
39 | def __next__(self):
40 | for k in self._db.keys():
41 | key = k.decode('utf-8')
42 | yield key, self.get(key)
43 |
44 | def delete(self, key):
45 | del self._db[key]
46 |
47 | def close(self):
48 | self._db.close()
49 |
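A minimal usage sketch of the adapter above; the file name is illustrative, and `dbm.dumb`
is chosen because it is available on every platform:

    import dbm.dumb

    from rltk.io.adapter import DbmKeyValueAdapter

    adapter = DbmKeyValueAdapter('my_store', dbm_class=dbm.dumb)
    adapter.set('k1', {'brand': 'Alienware'})   # values go through PickleSerializer
    print(adapter.get('k1'))                    # {'brand': 'Alienware'}
    for key, value in adapter:                  # iterate over all stored pairs
        print(key, value)
    adapter.close()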
--------------------------------------------------------------------------------
/rltk/io/adapter/hbase_key_value_adapter.py:
--------------------------------------------------------------------------------
1 | from rltk.record import Record
2 | from rltk.io.adapter import KeyValueAdapter
3 | from rltk.io.serializer import Serializer, PickleSerializer
4 | from rltk.utils import module_importer
5 |
6 |
7 | happybase = module_importer('happybase', 'happybase>=1.1.0')
8 |
9 |
10 | class HBaseKeyValueAdapter(KeyValueAdapter):
11 | """
12 | Hbase Adapter.
13 |
14 | Args:
15 | host (str): Host address.
16 | table (str): HBase table name.
17 | serializer (Serializer, optional): The serializer used to serialize Record object.
18 | If it's None, `PickleSerializer` will be used. Defaults to None.
19 | key_prefix (str, optional): The prefix of HBase row key.
20 | clean (bool, optional): Clean adapters while starting. Defaults to False.
21 |         **kwargs: Other parameters used by `happybase.Connection`.
22 |
23 | Note:
24 |         The thrift timeouts in hbase-site.xml need to be increased::
25 | 
26 |             <property>
27 |                 <name>hbase.thrift.server.socket.read.timeout</name>
28 |                 <value>6000000</value>
29 |             </property>
30 |             <property>
31 |                 <name>hbase.thrift.connection.max-idletime</name>
32 |                 <value>18000000</value>
33 |             </property>
34 | """
35 |
36 | def __init__(self, host, table, serializer: Serializer = None, key_prefix: str = '', clean: bool = False, **kwargs):
37 | if not serializer:
38 | serializer = PickleSerializer()
39 | self._conn = happybase().Connection(host=host, timeout=None, **kwargs)
40 | self._serializer = serializer
41 | self._key_prefix = key_prefix
42 | self._family_name = 'rltk'
43 | self._col_name = 'obj'
44 | self._fam_col_name = '{}:{}'.format(self._family_name, self._col_name).encode('utf-8')
45 |
46 | if table.encode('utf-8') not in self._conn.tables():
47 | self._create_table(table)
48 | self._table = self._conn.table(table)
49 |
50 | if clean:
51 | self.clean()
52 |
53 | #: parallel-safe
54 | parallel_safe = True
55 |
56 | def _encode_key(self, key):
57 |         # prefix the key and encode it as utf-8 bytes
58 | return '{prefix}{key}'.format(prefix=self._key_prefix, key=key).encode('utf-8')
59 |
60 | def _decode_key(self, key):
61 | key = key.decode('utf-8')
62 | return key[len(self._key_prefix):]
63 |
64 | def close(self):
65 | try:
66 | self._conn.close()
67 | except:
68 | pass
69 |
70 | def _create_table(self, table_name):
71 | self._conn.create_table(table_name, {self._family_name: dict()})
72 |
73 | def get(self, key) -> object:
74 | return self._serializer.loads(self._table.row(self._encode_key(key))[self._fam_col_name])
75 |
76 | def set(self, key, value: object):
77 | return self._table.put(self._encode_key(key), {self._fam_col_name: self._serializer.dumps(value)})
78 |
79 | def delete(self, key):
80 | return self._table.delete(self._encode_key(key))
81 |
82 | def __next__(self):
83 | for key, data in self._table.scan(
84 | row_prefix=self._key_prefix.encode('utf-8'), filter=b'FirstKeyOnlyFilter()'):
85 | yield self._decode_key(key), self._serializer.loads(data[self._fam_col_name])
86 |
--------------------------------------------------------------------------------
/rltk/io/adapter/key_set_adapter.py:
--------------------------------------------------------------------------------
1 | import json
2 | import io
3 |
4 |
5 | class KeySetAdapter(object):
6 | """
7 | Key Set Adapter.
8 | """
9 |
10 | def get(self, key: str):
11 | """
12 | Get value by key.
13 |
14 | Args:
15 | key (str): Key.
16 |
17 | Returns:
18 | set: A set of values, None if key doesn't exist.
19 | """
20 | raise NotImplementedError
21 |
22 | def set(self, key: str, value: set):
23 | """
24 | Set a set by key.
25 |
26 | Args:
27 | key (str): Key.
28 | value (builtins.set): Value set.
29 | """
30 | raise NotImplementedError
31 |
32 | def add(self, key: str, value: object):
33 | """
34 | Add value to a set by key. If key doesn't exist, create one.
35 |
36 | Args:
37 | key (str): Key.
38 | value (object): Value.
39 | """
40 | raise NotImplementedError
41 |
42 | def remove(self, key: str, value: object):
43 | """
44 |         Remove value from the set stored at key.
45 |
46 | Args:
47 | key (str): Key.
48 | value (object): Value.
49 | """
50 | raise NotImplementedError
51 |
52 | def delete(self, key: str):
53 | """
54 | Delete a set by key.
55 |
56 | Args:
57 | key (str): Key.
58 | """
59 | raise NotImplementedError
60 |
61 | def dump(self, f: io.IOBase):
62 | """
63 | Dump data to json lines format. Each json object is formatted as `{key: [value1, value2, ...]}`.
64 |
65 | Args:
66 | f (io.IOBase): IO handler.
67 | """
68 | for k, ss in self:
69 | obj = {k: list(ss)}
70 | f.write(json.dumps(obj) + '\n')
71 |
72 | def clean(self):
73 | """
74 | Delete all keys in this adapter.
75 | """
76 | for k, _ in self:
77 | self.delete(k)
78 |
79 | def __init__(self):
80 | pass
81 |
82 | def __del__(self):
83 | """
84 | Same as :meth:`close`.
85 | """
86 | self.close()
87 |
88 | def __iter__(self):
89 | """
90 | Same as :meth:`__next__`.
91 | """
92 | return self.__next__()
93 |
94 | def __next__(self):
95 | """
96 | Iterator of the data store. This is not required.
97 |
98 | Returns:
99 | iter: key, set
100 | """
101 | pass
102 |
103 | def close(self):
104 | """
105 | Close handler if needed.
106 | """
107 | pass
108 |
--------------------------------------------------------------------------------
/rltk/io/adapter/key_value_adapter.py:
--------------------------------------------------------------------------------
1 | class KeyValueAdapter(object):
2 | """
3 | Super class of adapter of key value stores.
4 | """
5 | def __init__(self):
6 | pass
7 |
8 | def __del__(self):
9 | """
10 | Same as :meth:`close`.
11 | """
12 | self.close()
13 |
14 | #: If this adapter is parallel-safe. Defaults to False if it's not overwritten in concrete class.
15 | parallel_safe = False
16 |
17 | def get(self, key: str) -> object:
18 | """
19 | Get value.
20 |
21 | Args:
22 | key (str): Key.
23 |
24 | Returns:
25 | object:
26 | """
27 | raise NotImplementedError
28 |
29 | def set(self, key: str, value: object):
30 | """
31 | Set value.
32 |
33 | Args:
34 | key (str): Key.
35 | value (object): Value.
36 | """
37 | raise NotImplementedError
38 |
39 | def delete(self, key):
40 | """
41 | Delete value.
42 |
43 | Args:
44 | key (str): Key.
45 | """
46 | raise NotImplementedError
47 |
48 | def clean(self):
49 | """
50 | Delete all keys in adapter.
51 | """
52 | for key, _ in self:
53 | self.delete(key)
54 |
55 | def __iter__(self):
56 | """
57 | Same as :meth:`__next__`.
58 | """
59 | return self.__next__()
60 |
61 | def __next__(self):
62 | """
63 | Iterator of the data store. This is not required.
64 |
65 | Returns:
66 | iter: key, value
67 | """
68 | pass
69 |
70 | def close(self):
71 | """
72 | Close handler if needed.
73 | """
74 | pass
--------------------------------------------------------------------------------
/rltk/io/adapter/leveldb_key_set_adapter.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from rltk.io.serializer import Serializer, PickleSerializer
4 | from rltk.io.adapter.key_set_adapter import KeySetAdapter
5 | from rltk.utils import module_importer
6 |
7 |
8 | plyvel = module_importer('plyvel', 'plyvel>=1.0.5', '''
9 | Please install LevelDB's system level package first: https://github.com/google/leveldb .
10 |
11 | If you are using Mac and installed LevelDB by HomeBrew,
12 | please make sure that `plyvel` refers to correct library file while installing:
13 |
14 | pip uninstall plyvel
15 | CFLAGS='-mmacosx-version-min=10.7 -stdlib=libc++' pip install --no-cache-dir plyvel
16 | ''')
17 |
18 |
19 | class LevelDbKeySetAdapter(KeySetAdapter):
20 | """
21 |     `LevelDB <https://github.com/google/leveldb>`_ key set adapter.
22 | LevelDB is a serverless, stand-alone key value store. It can be used as a local file system store.
23 |
24 |
25 | Args:
26 | path (str): The directory path used by LevelDB.
27 | name (str): Because LevelDB only has a single key space, \
28 | this is used as name space.
29 | serializer (Serializer, optional): The serializer used to serialize each object in set.
30 | If it's None, `PickleSerializer` will be used. Defaults to None.
31 | clean (bool, optional): Clean adapters while starting. Defaults to False.
32 |         kwargs: Other keyword arguments for `plyvel.DB`.
33 |
34 | Note:
35 |         A LevelDB database can only be accessed by one process at a time.
36 |         This adapter uses a singleton (per RLTK instance) to make sure only one `plyvel.DB` is created.
37 |         If you don't want to create multiple databases, give each adapter a different `name`.
38 | """
39 | _db_instance = None
40 | _db_ref_count = 0
41 |
42 | def __init__(self, path: str, name: str, serializer: Serializer = None, clean: bool = False, **kwargs):
43 | if not serializer:
44 | serializer = PickleSerializer()
45 |
46 | # leveldb's connection can only be a singleton
47 | if not self.__class__._db_instance:
48 | if not os.path.exists(path):
49 | os.mkdir(path)
50 | self.__class__._db_instance = plyvel().DB(path, create_if_missing=True, **kwargs)
51 | self._db = self.__class__._db_instance
52 | self.__class__._db_ref_count += 1
53 |
54 | self._prefix = '{name}_'.format(name=name)
55 | self._prefix_db = self._db.prefixed_db(self._encode(self._prefix))
56 | self._serializer = serializer
57 |
58 | if clean:
59 | self.clean()
60 |
61 | @staticmethod
62 | def _encode(string):
63 | return string.encode(encoding='utf-8')
64 |
65 | @staticmethod
66 | def _decode(bytes_):
67 | return bytes_.decode(encoding='utf-8')
68 |
69 | def _get(self, key):
70 | v = self._prefix_db.get(key)
71 | if not v:
72 | return
73 | return self._serializer.loads(v)
74 |
75 | def get(self, key):
76 | return self._get(self._encode(key))
77 |
78 | def set(self, key, value):
79 | if not isinstance(value, set):
80 | raise ValueError('value must be a set')
81 | self.delete(key)
82 | self._prefix_db.put(self._encode(key), self._serializer.dumps(value))
83 |
84 | def add(self, key, value):
85 | set_ = self.get(key)
86 | if not set_:
87 | set_ = set([])
88 | set_.add(value)
89 | return self.set(key, set_)
90 |
91 | def remove(self, key, value):
92 | set_ = self.get(key)
93 | if not set_:
94 | return
95 | set_.remove(value)
96 | return self.set(key, set_)
97 |
98 | def delete(self, key):
99 | return self._prefix_db.delete(self._encode(key))
100 |
101 | def __next__(self):
102 | for key in self._prefix_db.iterator(include_value=False):
103 | yield self._decode(key), self._get(key)
104 |
105 | def close(self):
106 | self.__class__._db_ref_count -= 1
107 | if self.__class__._db_ref_count == 0:
108 | self._db.close()
109 |
--------------------------------------------------------------------------------
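
A minimal usage sketch for `LevelDbKeySetAdapter`, assuming `plyvel` and the LevelDB
system library are installed; the `./leveldb_data` path and the keys are hypothetical:

    from rltk.io.adapter.leveldb_key_set_adapter import LevelDbKeySetAdapter

    # each `name` gets its own prefix (namespace) inside LevelDB's single key space
    ks = LevelDbKeySetAdapter('./leveldb_data', name='blocks')
    ks.add('key1', 'value1')  # creates the set on first add
    ks.add('key1', 'value2')
    print(ks.get('key1'))     # {'value1', 'value2'}
    ks.close()                # decrements the singleton's reference count
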
/rltk/io/adapter/memory_key_set_adapter.py:
--------------------------------------------------------------------------------
1 | from rltk.io.adapter.key_set_adapter import KeySetAdapter
2 |
3 |
4 | class MemoryKeySetAdapter(KeySetAdapter):
5 | """
6 |     Memory key set adapter.
7 | """
8 |
9 | def __init__(self):
10 | self._store = dict()
11 |
12 | def get(self, key):
13 | return self._store.get(key)
14 |
15 | def set(self, key, value):
16 | if not isinstance(value, set):
17 | raise ValueError('value must be a set')
18 | self._store[key] = value
19 |
20 | def add(self, key, value):
21 | if key not in self._store:
22 | self._store[key] = set()
23 | self._store[key].add(value)
24 |
25 | def remove(self, key, value):
26 | self._store[key].remove(value)
27 |
28 | def delete(self, key):
29 | del self._store[key]
30 |
31 | def clean(self):
32 | self._store = dict()
33 |
34 | def __next__(self):
35 | for k, v in self._store.items():
36 | yield k, v
37 |
--------------------------------------------------------------------------------
/rltk/io/adapter/memory_key_value_adapter.py:
--------------------------------------------------------------------------------
1 | from rltk.io.adapter import KeyValueAdapter
2 |
3 |
4 | class MemoryKeyValueAdapter(KeyValueAdapter):
5 | """
6 | Basic in-memory (dict) adapter.
7 | """
8 | def __init__(self):
9 | self._dict = dict()
10 |
11 | def get(self, key):
12 | return self._dict.get(key)
13 |
14 | def set(self, key, value: object):
15 | self._dict[key] = value
16 |
17 | def __next__(self):
18 | for key, value in self._dict.items():
19 | yield key, value
20 |
21 | def delete(self, key):
22 | del self._dict[key]
23 |
24 | def clean(self):
25 | self._dict = dict()
26 |
--------------------------------------------------------------------------------
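
A short sketch of the `KeyValueAdapter` contract using the in-memory implementation
above; the keys and values are hypothetical:

    from rltk.io.adapter.memory_key_value_adapter import MemoryKeyValueAdapter

    kv = MemoryKeyValueAdapter()
    kv.set('r1', {'name': 'alice'})
    kv.set('r2', {'name': 'bob'})
    print(kv.get('r1'))    # {'name': 'alice'}
    for key, value in kv:  # __iter__ yields (key, value) pairs
        print(key, value)
    kv.delete('r2')
    kv.clean()             # drops all remaining keys
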
/rltk/io/adapter/redis_key_set_adapter.py:
--------------------------------------------------------------------------------
1 | from rltk.io.serializer import Serializer, PickleSerializer
2 | from rltk.io.adapter.key_set_adapter import KeySetAdapter
3 | from rltk.utils import module_importer
4 |
5 |
6 | redis = module_importer('redis', 'redis>=2.0.0')
7 |
8 |
9 | class RedisKeySetAdapter(KeySetAdapter):
10 | """
11 | Redis key set adapter.
12 |
13 | Args:
14 | host (str): Host address.
15 |         serializer (Serializer, optional): The serializer used to serialize each object in set.
16 |             If it's None, `PickleSerializer` will be used. Defaults to None.
17 |         key_prefix (str, optional): Prefix of key in redis. Defaults to empty string.
18 |         clean (bool, optional): Clean the adapter while starting. Defaults to False.
19 |         **kwargs: Other parameters used by `redis.Redis <https://redis-py.readthedocs.io/>`_ .
20 | """
21 |
22 | def __init__(self, host, key_prefix: str = '', serializer: Serializer=None, clean: bool = False, **kwargs):
23 | if not serializer:
24 | serializer = PickleSerializer()
25 | self._redis = redis().Redis(host=host, **kwargs)
26 | self._serializer = serializer
27 | self._key_prefix = key_prefix
28 |
29 | if clean:
30 | self.clean()
31 |
32 | def _encode_key(self, key):
33 | return '{prefix}{key}'.format(prefix=self._key_prefix, key=key)
34 |
35 | def _decode_key(self, key):
36 | key = key.decode('utf-8')
37 | return key[len(self._key_prefix):]
38 |
39 | def get(self, key):
40 | return self._get(self._encode_key(key))
41 |
42 | def _get(self, key):
43 | v = set([self._serializer.loads(v) for v in self._redis.smembers(key)])
44 | if len(v) != 0:
45 | return v
46 |
47 | def set(self, key, value):
48 | if not isinstance(value, set):
49 | raise ValueError('value must be a set')
50 | self.delete(key)
51 | for v in value:
52 | self.add(key, v)
53 |
54 | def add(self, key, value):
55 | return self._redis.sadd(self._encode_key(key), self._serializer.dumps(value))
56 |
57 | def remove(self, key, value):
58 | return self._redis.srem(self._encode_key(key), self._serializer.dumps(value))
59 |
60 | def delete(self, key):
61 | return self._redis.delete(self._encode_key(key))
62 |
63 | def __next__(self):
64 | # scan_iter() returns generator, keys() returns array
65 | for key in self._redis.scan_iter(self._encode_key('*')):
66 | yield self._decode_key(key), self._get(key)
67 |
--------------------------------------------------------------------------------
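
A usage sketch for `RedisKeySetAdapter`, assuming a Redis server is reachable on
127.0.0.1; the host, prefix, and keys are hypothetical:

    from rltk.io.adapter.redis_key_set_adapter import RedisKeySetAdapter

    ks = RedisKeySetAdapter('127.0.0.1', key_prefix='block_')
    ks.add('token_a', 'record_1')  # stored as a Redis set under key 'block_token_a'
    ks.add('token_a', 'record_2')
    print(ks.get('token_a'))       # {'record_1', 'record_2'}
    ks.delete('token_a')
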
/rltk/io/adapter/redis_key_value_adapter.py:
--------------------------------------------------------------------------------
1 | from rltk.record import Record
2 | from rltk.io.adapter import KeyValueAdapter
3 | from rltk.io.serializer import Serializer, PickleSerializer
4 | from rltk.utils import module_importer
5 |
6 |
7 | redis = module_importer('redis', 'redis>=2.0.0')
8 |
9 |
10 | class RedisKeyValueAdapter(KeyValueAdapter):
11 | """
12 | Redis adapter.
13 |
14 | Args:
15 | host (str): Host address.
16 | serializer (Serializer, optional): The serializer used to serialize Record object.
17 | If it's None, `PickleSerializer` will be used. Defaults to None.
18 | key_prefix (str, optional): Prefix of key in redis. Defaults to empty string.
19 | clean (bool, optional): Clean adapters while starting. Defaults to False.
20 |         **kwargs: Other parameters used by `redis.Redis <https://redis-py.readthedocs.io/>`_ .
21 | """
22 | def __init__(self, host, serializer: Serializer=None, key_prefix: str = '', clean: bool = False, **kwargs):
23 | if not serializer:
24 | serializer = PickleSerializer()
25 | self._redis = redis().Redis(host=host, **kwargs)
26 | self._serializer = serializer
27 | self._key_prefix = key_prefix
28 |
29 | if clean:
30 | self.clean()
31 |
32 | #: parallel-safe
33 | parallel_safe = True
34 |
35 | def _encode_key(self, key):
36 | return self._key_prefix + key
37 |
38 | def _decode_key(self, key):
39 | key = key.decode('utf-8')
40 | return key[len(self._key_prefix):]
41 |
42 | def get(self, key) -> object:
43 | v = self._redis.get(self._encode_key(key))
44 | if v:
45 | return self._serializer.loads(v)
46 |
47 | def set(self, key, value: object):
48 | return self._redis.set(self._encode_key(key), self._serializer.dumps(value))
49 |
50 | def delete(self, key):
51 | return self._redis.delete(self._encode_key(key))
52 |
53 | def __next__(self):
54 | # scan_iter() returns generator, keys() returns array
55 | for key in self._redis.scan_iter(self._encode_key('*')):
56 | yield self._decode_key(key), self._serializer.loads(self._redis.get(key))
57 |
--------------------------------------------------------------------------------
/rltk/io/io_utils.py:
--------------------------------------------------------------------------------
1 | import io
2 |
3 |
4 | def get_file_handler(f, mode='r'):
5 | """
6 | Helper function for getting file handler.
7 |
8 | Args:
9 | f (Union[str,io.IOBase]): File path or handler.
10 | mode (str, optional): Parameter mode in :py:meth:`open`. Defaults to `r`.
11 |
12 | Returns:
13 | io.IOBase: File handler.
14 | """
15 | if isinstance(f, io.IOBase):
16 | return f
17 |
18 | return open(f, mode)
19 |
--------------------------------------------------------------------------------
/rltk/io/reader/__init__.py:
--------------------------------------------------------------------------------
1 | from rltk.io.reader.reader import Reader
2 | from rltk.io.reader.array_reader import ArrayReader
3 | from rltk.io.reader.dataframe_reader import DataFrameReader
4 | from rltk.io.reader.csv_reader import CSVReader
5 | from rltk.io.reader.jsonlines_reader import JsonLinesReader
6 | from rltk.io.reader.ground_truth_reader import GroundTruthReader
7 |
--------------------------------------------------------------------------------
/rltk/io/reader/array_reader.py:
--------------------------------------------------------------------------------
1 | from rltk.io.reader import Reader
2 |
3 |
4 | class ArrayReader(Reader):
5 | """
6 | Array Reader.
7 |
8 | Args:
9 | array (list): Array.
10 | """
11 |
12 | def __init__(self, array):
13 | try:
14 | for _ in array:
15 | break
16 | except TypeError:
17 |             raise TypeError('Cannot iterate on given array')
18 |
19 | self._array = array
20 |
21 | def __next__(self):
22 | for item in self._array:
23 | yield item
24 |
25 |
--------------------------------------------------------------------------------
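
A quick sketch of reading from an in-memory list; `Reader.__iter__` delegates to the
generator defined in `__next__`, so the reader can be used directly in a for loop:

    from rltk.io.reader.array_reader import ArrayReader

    reader = ArrayReader([{'id': '1', 'name': 'alice'}, {'id': '2', 'name': 'bob'}])
    for raw_object in reader:
        print(raw_object['id'], raw_object['name'])
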
/rltk/io/reader/csv_reader.py:
--------------------------------------------------------------------------------
1 | import csv
2 |
3 | from rltk.io.reader import Reader
4 | from rltk.io.io_utils import get_file_handler
5 |
6 |
7 | class CSVReader(Reader):
8 | """
9 | CSV reader.
10 |
11 | Args:
12 | file_handler (str/io.IOBase): File name or file handler of input file.
13 |         **kwargs: Other parameters used by `csv.DictReader <https://docs.python.org/3/library/csv.html#csv.DictReader>`_ .
14 | """
15 |
16 | def __init__(self, file_handler, **kwargs):
17 | self._file_handler = get_file_handler(file_handler)
18 | self._csv_reader = csv.DictReader(self._file_handler, **kwargs)
19 |
20 | def __next__(self):
21 | for obj in self._csv_reader:
22 | yield {t[0]: t[1] for t in obj.items()}
23 |
24 | def close(self):
25 | try:
26 | self._file_handler.close()
27 | except:
28 | pass
29 |
--------------------------------------------------------------------------------
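
A usage sketch for `CSVReader`; `ds.csv` is a hypothetical file and any extra keyword
arguments are forwarded to `csv.DictReader`:

    from rltk.io.reader.csv_reader import CSVReader

    reader = CSVReader('ds.csv', delimiter=',')
    for raw_object in reader:
        print(raw_object)  # one dict per row, keyed by the CSV header
    reader.close()
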
/rltk/io/reader/dataframe_reader.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | from rltk.io.reader import Reader
4 |
5 |
6 | class DataFrameReader(Reader):
7 | """
8 | Pandas DataFrame Reader.
9 |
10 | Args:
11 | df (pandas.DataFrame): DataFrame.
12 | keep_dataframe_default_index (bool, optional): If True, add a key `dataframe_default_index` holding the \
13 | original index in Dataframe. Defaults to False.
14 | """
15 |
16 | def __init__(self, df: pd.DataFrame, keep_dataframe_default_index: bool = False):
17 | self._df = df
18 | self._keep_dataframe_default_index = keep_dataframe_default_index
19 |
20 | def __next__(self):
21 | if self._keep_dataframe_default_index:
22 | for i, item in self._df.iterrows():
23 | yield dict(item.to_dict(), dataframe_default_index=i)
24 | else:
25 | for _, item in self._df.iterrows():
26 | yield item.to_dict()
27 |
--------------------------------------------------------------------------------
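
A usage sketch for `DataFrameReader` with a toy DataFrame, showing the effect of
`keep_dataframe_default_index`:

    import pandas as pd
    from rltk.io.reader.dataframe_reader import DataFrameReader

    df = pd.DataFrame({'id': ['1', '2'], 'name': ['alice', 'bob']})
    for raw_object in DataFrameReader(df, keep_dataframe_default_index=True):
        print(raw_object)  # includes 'dataframe_default_index': 0, 1, ...
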
/rltk/io/reader/ground_truth_reader.py:
--------------------------------------------------------------------------------
1 | import csv
2 |
3 | from rltk.io.reader import Reader
4 | from rltk.io.io_utils import get_file_handler
5 |
6 |
7 | class GroundTruthReader(Reader):
8 | """
9 |     Ground truth reader. Currently, ground truth is stored in CSV format.
10 |
11 | Args:
12 | file_handler (str/io.IOBase): File name or file handler of input file.
13 |         **kwargs: Other parameters used by `csv.DictReader <https://docs.python.org/3/library/csv.html#csv.DictReader>`_ .
14 | """
15 |
16 | def __init__(self, file_handler, **kwargs):
17 | self._file_handler = get_file_handler(file_handler)
18 | self._csv_reader = csv.DictReader(self._file_handler, **kwargs)
19 |
20 | def __next__(self):
21 | for obj in self._csv_reader:
22 | yield {t[0]: t[1] for t in obj.items()}
23 |
24 | def close(self):
25 | try:
26 | self._file_handler.close()
27 | except:
28 | pass
29 |
--------------------------------------------------------------------------------
/rltk/io/reader/jsonlines_reader.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 |
4 | from rltk.io.reader import Reader
5 | from rltk.io.io_utils import get_file_handler
6 |
7 |
8 | class JsonLinesReader(Reader):
9 | """
10 |     `JSON Lines <https://jsonlines.org/>`_ Reader.
11 |
12 | Args:
13 | file_handler (str/io.IOBase): File name or file handler of input file.
14 | ignore_blank_line (bool): If blank line should be ignored. Defaults to True.
15 | """
16 |
17 | def __init__(self, file_handler, ignore_blank_line=True):
18 | self._file_handler = get_file_handler(file_handler)
19 | self._ignore_blank_line = ignore_blank_line
20 |
21 | def __next__(self):
22 | for line in self._file_handler:
23 | if line.strip() == '':
24 | if self._ignore_blank_line:
25 | continue
26 | else:
27 | raise ValueError('Blank line detected')
28 | yield json.loads(line)
29 |
30 | def close(self):
31 | try:
32 | self._file_handler.close()
33 | except:
34 | pass
35 |
--------------------------------------------------------------------------------
/rltk/io/reader/reader.py:
--------------------------------------------------------------------------------
1 | import io
2 |
3 |
4 | class Reader(object):
5 | """
6 | Reader.
7 | """
8 |
9 | def __init__(self):
10 | pass
11 |
12 | def __iter__(self):
13 | """
14 | Same as :meth:`__next__`.
15 | """
16 | return self.__next__()
17 |
18 | def __next__(self):
19 | """
20 | Iterator.
21 |
22 | Returns:
23 |             iter: `raw_object`. Each `raw_object` is a dict which represents the raw data of a logical row.
24 | """
25 | raise NotImplementedError
26 |
27 | def __del__(self):
28 | """
29 |         Same as :meth:`close`.
30 |         """
31 |         self.close()
32 | def close(self):
33 | """
34 | Close handler.
35 | """
36 | pass
37 |
--------------------------------------------------------------------------------
/rltk/io/serializer/__init__.py:
--------------------------------------------------------------------------------
1 | from rltk.io.serializer.serializer import Serializer
2 | from rltk.io.serializer.pickle_serializer import PickleSerializer
3 |
--------------------------------------------------------------------------------
/rltk/io/serializer/pickle_serializer.py:
--------------------------------------------------------------------------------
1 | import pickle
2 |
3 | from rltk.io.serializer import Serializer
4 |
5 |
6 | class PickleSerializer(Serializer):
7 | """
8 |     `Pickle <https://docs.python.org/3/library/pickle.html>`_ serializer.
9 | """
10 |
11 | def loads(self, string):
12 | return pickle.loads(string)
13 |
14 | def dumps(self, obj):
15 | return pickle.dumps(obj)
16 |
--------------------------------------------------------------------------------
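
A round-trip sketch of the `Serializer` contract using `PickleSerializer`:

    from rltk.io.serializer import PickleSerializer

    serializer = PickleSerializer()
    blob = serializer.dumps({'a', 'b'})  # bytes
    print(serializer.loads(blob))        # {'a', 'b'}
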
/rltk/io/serializer/serializer.py:
--------------------------------------------------------------------------------
1 | class Serializer(object):
2 | """
3 |     Serialize and deserialize objects. This is the superclass.
4 | """
5 |
6 | def loads(self, obj):
7 | """
8 | Load a serialized object.
9 |
10 | Args:
11 |         obj (obj): In most cases, it's bytes or a string.
12 |
13 | Returns:
14 | obj: Python object.
15 | """
16 | raise NotImplementedError
17 |
18 | def dumps(self, obj):
19 | """
20 | Serialize the given object.
21 |
22 | Args:
23 | obj (obj): Python object.
24 |
25 | Returns:
26 | obj: Serialized object.
27 | """
28 | raise NotImplementedError
29 |
--------------------------------------------------------------------------------
/rltk/io/writer/__init__.py:
--------------------------------------------------------------------------------
1 | from rltk.io.writer.writer import Writer
2 | from rltk.io.writer.ground_truth_writer import GroundTruthWriter
3 |
--------------------------------------------------------------------------------
/rltk/io/writer/ground_truth_writer.py:
--------------------------------------------------------------------------------
1 | import csv
2 |
3 | from rltk.io.writer import Writer
4 | from rltk.io.io_utils import get_file_handler
5 |
6 |
7 | class GroundTruthWriter(Writer):
8 | """
9 | Ground truth writer.
10 |
11 | Args:
12 |         file_handler (str/io.IOBase): File name or file handler of output file.
13 | """
14 | def __init__(self, file_handler):
15 | self._file_handler = get_file_handler(file_handler, 'w')
16 | fieldnames = ['id1', 'id2', 'label']
17 | self._csv_writer = csv.DictWriter(self._file_handler, fieldnames=fieldnames)
18 | self._csv_writer.writeheader()
19 |
20 | def write(self, id1: str, id2: str, label: bool):
21 | """
22 |         Write a row to file.
23 |
24 | Args:
25 | id1 (str): Id 1.
26 | id2 (str): Id 2.
27 | label (bool): Positive (True) or negative (False).
28 | """
29 | self._csv_writer.writerow({'id1': id1, 'id2': id2, 'label': label})
30 |
31 | def close(self):
32 | try:
33 | self._file_handler.close()
34 | except:
35 | pass
36 |
--------------------------------------------------------------------------------
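
A usage sketch for `GroundTruthWriter`; the output path `gt.csv` and the ids are
hypothetical:

    from rltk.io.writer.ground_truth_writer import GroundTruthWriter

    writer = GroundTruthWriter('gt.csv')
    writer.write('1', 'a', True)   # positive pair
    writer.write('2', 'b', False)  # negative pair
    writer.close()                 # produces columns id1, id2, label
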
/rltk/io/writer/writer.py:
--------------------------------------------------------------------------------
1 | import io
2 |
3 |
4 | class Writer(object):
5 | """
6 | Writer.
7 | """
8 | def __init__(self):
9 | pass
10 |
11 | def write(self):
12 | """
13 | Write content.
14 | """
15 | raise NotImplementedError
16 |
17 | def __del__(self):
18 | """
19 |         Same as :meth:`close`.
20 | """
21 | self.close()
22 |
23 | def close(self):
24 | """
25 | Close handler.
26 | """
27 | pass
28 |
--------------------------------------------------------------------------------
/rltk/record.py:
--------------------------------------------------------------------------------
1 | import re
2 | from typing import Callable
3 |
4 |
5 | # Record ID should be string
6 | re_record_id = re.compile(r'^[^*]{1,255}$')
7 | # Valid python's property name
8 | # https://docs.python.org/3.6/reference/lexical_analysis.html#identifiers
9 | re_valid_property_name = re.compile(r'^[A-Za-z_]{1}[\w]*$')
10 |
11 |
12 | class Record(object):
13 | """
14 | Record representation. Properties should be defined for further usage.
15 |
16 | Args:
17 | raw_object (dict): Raw data which will be used to create properties.
18 | """
19 |
20 | _remove_raw_object = False
21 |
22 | def __init__(self, raw_object):
23 | self.raw_object = raw_object
24 |
25 | @property
26 | def id(self):
27 | """
28 |         Required property. Type has to be a utf-8 string.
29 | """
30 | raise NotImplementedError
31 |
32 | def __eq__(self, other):
33 | """
34 | Only if both instances have the same class and id.
35 |
36 | Returns:
37 | bool: Equal or not.
38 | """
39 | if not isinstance(other, self.__class__): # class should be exactly the same
40 | return False
41 | return self.id == other.id
42 |
43 |
44 | class cached_property(property):
45 | """
46 | Decorator.
47 | If a Record property is decorated, the final value of it will be pre-calculated.
48 | """
49 | def __init__(self, func):
50 | self.func = func
51 |
52 | def __get__(self, obj, cls):
53 | """
54 | Args:
55 | obj (object): Record instance
56 | cls (class): Record class
57 | Returns:
58 | object: cached value
59 | """
60 | if obj is None:
61 | return self
62 |
63 | # create property if it's not there
64 | cached_name = self.func.__name__
65 | if cached_name not in obj.__dict__:
66 | obj.__dict__[cached_name] = self.func(obj)
67 |
68 | value = obj.__dict__.get(cached_name)
69 | return value
70 |
71 | def __reduce__(self):
72 | return cached_property.__new__, (cached_property,), {'func': self.func}
73 |
74 |
75 | def remove_raw_object(cls):
76 | """
77 | Decorator for Record class.
78 |     If a Record class is decorated, raw_object will be removed once all marked properties are cached.
79 | """
80 | cls._remove_raw_object = True
81 | return cls
82 |
83 |
84 | def generate_record_property_cache(obj):
85 | """
86 | Generate final value on all cached_property decorated methods.
87 |
88 | Args:
89 | obj (Record): Record instance.
90 | """
91 | for prop_name, prop_type in obj.__class__.__dict__.items():
92 | if isinstance(prop_type, cached_property):
93 | getattr(obj, prop_name)
94 |
95 | validate_record(obj)
96 |
97 | if obj.__class__._remove_raw_object:
98 | del obj.__dict__['raw_object']
99 |
100 |
101 | def validate_record(obj):
102 | """
103 | Property validator of record instance.
104 |
105 | Args:
106 | obj (Record): Record instance.
107 |
108 | Raises:
109 | TypeError: if id is not valid
110 | """
111 | if not isinstance(obj.id, str):
112 | raise TypeError('Id in {} should be an utf-8 encoded string.'.format(obj.__class__.__name__))
113 | if not re_record_id.match(obj.id):
114 | raise ValueError('Id is not valid')
115 |
116 |
117 | def get_property_names(cls: type):
118 | """
119 | Get keys of property and cached_property from a record class.
120 |
121 | Args:
122 | cls (type): Record class
123 |
124 | Returns:
125 | list: Property names in class
126 | """
127 | keys = []
128 | for prop_name, prop_type in cls.__dict__.items():
129 | if not isinstance(prop_type, property) and not isinstance(prop_type, cached_property):
130 | continue
131 | keys.append(prop_name)
132 | return keys
133 |
134 |
135 | def set_id(key: str, function_: Callable = None, keep_original: bool = False):
136 | """
137 | Decorator for AutoGeneratedRecord class.
138 | If an AutoGeneratedRecord class is decorated, the value of specified key from raw_object will be used as id.
139 |
140 | Args:
141 | key (str): Key in `raw_object`.
142 | function_ (Callable, optional): Function to modify value. Signature is `function_(raw_object[key])`.
143 | Defaults to None.
144 | keep_original (bool, optional): If the original key should be kept. Defaults to False.
145 | """
146 | def wrapper(cls):
147 | cls._id_key = key
148 | cls._id_function = function_
149 | cls._id_keep_original = keep_original
150 | return cls
151 | return wrapper
152 |
153 |
154 | class AutoGeneratedRecord(Record):
155 | """
156 | Properties are auto generated based on the keys in `raw_object`.
157 |
158 |     `raw_object` has to contain key `id`, which is used as the record's id.
159 |
160 | Args:
161 | raw_object (dict): Raw data which will be used to create properties.
162 | """
163 |
164 | _id_key = 'id'
165 | _id_function = None
166 | _id_keep_original = False
167 |
168 | def __init__(self, raw_object: dict):
169 | super().__init__(raw_object)
170 | for k in raw_object.keys():
171 | if k == self.__class__._id_key:
172 | if not self.__class__._id_keep_original:
173 | continue
174 |
175 | if not hasattr(self.__class__, k):
176 | if not re_valid_property_name.match(k):
177 | raise ValueError('Invalid property name')
178 | setattr(self.__class__, k, self.__class__._generate_property(k))
179 |
180 | @staticmethod
181 | def _generate_property(k):
182 |
183 | @property
184 | def get_value(ins):
185 | return ins.raw_object[k]
186 |
187 | return get_value
188 |
189 | @property
190 | def id(self):
191 | id_ = self.raw_object[self.__class__._id_key]
192 | function_ = self.__class__._id_function
193 | if function_:
194 | id_ = function_(id_)
195 | return id_
196 |
--------------------------------------------------------------------------------
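
A sketch of defining records with the helpers above; the field names (`doc_id`, `name`)
are hypothetical:

    from rltk.record import Record, cached_property, remove_raw_object, \
        AutoGeneratedRecord, set_id

    @remove_raw_object
    class PersonRecord(Record):
        @property
        def id(self):
            return self.raw_object['doc_id']

        @cached_property
        def name_tokens(self):
            # cached on first access; once generate_record_property_cache() runs
            # (Dataset does this), raw_object is dropped because of the decorator
            return set(self.raw_object['name'].split())

    @set_id('doc_id', function_=lambda x: x.strip())
    class AutoPersonRecord(AutoGeneratedRecord):
        pass

    r = PersonRecord({'doc_id': '1', 'name': 'alice smith'})
    print(r.id, r.name_tokens)  # 1 {'alice', 'smith'}

    a = AutoPersonRecord({'doc_id': ' 2 ', 'name': 'bob'})
    print(a.id, a.name)         # 2 bob
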
/rltk/remote/__init__.py:
--------------------------------------------------------------------------------
1 | from rltk.remote.remote import Remote
2 | from rltk.remote.task import Task
3 |
--------------------------------------------------------------------------------
/rltk/remote/remote.py:
--------------------------------------------------------------------------------
1 | import os
2 | import glob
3 | from distributed import Worker
4 |
5 | from dask.distributed import Client
6 | from distributed.security import Security
7 |
8 |
9 | class Remote(object):
10 | """
11 | Remote.
12 |
13 | Args:
14 |         address (str): Remote scheduler address in the form `ip:port`.
15 | tls_ca_file (str, optional): TLS CA certificate file path. Defaults to None.
16 | tls_client_cert (str, optional): TLS certificate file path. Defaults to None.
17 | tls_client_key (str, optional): TLS private key file path. Defaults to None.
18 | require_encryption (bool, optional): Encrypt data exchange. Defaults to False.
19 |
20 | Note:
21 | TLS will be enabled only if all three TLS arguments are provided.
22 | Remember to change network protocol to `tls://`.
23 | """
24 | def __init__(self, address: str,
25 | tls_ca_file: str = None, tls_client_cert: str = None, tls_client_key: str = None,
26 | require_encryption: bool = False):
27 | # authentication
28 | sec = None
29 | if tls_ca_file and tls_client_cert and tls_client_key:
30 | sec = Security(tls_ca_file=tls_ca_file,
31 | tls_client_cert=tls_client_cert,
32 | tls_client_key=tls_client_key,
33 | require_encryption=require_encryption)
34 |
35 | # init
36 | self._client = Client(address=address, security=sec)
37 | self._client.register_worker_callbacks(Remote._worker_startup)
38 |
39 | @staticmethod
40 | def _worker_startup(dask_worker: Worker):
41 | os.chdir(dask_worker.local_dir)
42 |
43 | def add_dependencies(self, files):
44 | """
45 | Add list of dependencies, order matters.
46 |
47 | Args:
48 | files (list): List of dependent files.
49 | """
50 | # TODO: automatically resolve module dependencies
51 | if isinstance(files, str):
52 | files = [files]
53 | for f in files:
54 | self._client.upload_file(f)
55 |
56 | def scatter(self, *args, **kwargs):
57 | """
58 | Scatter data.
59 | """
60 | return self._client.scatter(*args, **kwargs)
61 |
62 | def submit(self, func, *args, **kwargs):
63 | """
64 | Submit function and data.
65 |
66 | Args:
67 | func (callable): User function.
68 | """
69 | return self._client.submit(func, *args, **kwargs)
70 |
71 | def fetch(self, futures_, **kwargs):
72 | """
73 | Fetch data of future objects.
74 |
75 | Args:
76 | futures_ (list): Future objects.
77 | """
78 | return self._client.gather(futures_, **kwargs)
79 |
80 | def cancel(self, futures_, **kwargs):
81 | """
82 | Cancel job of future objects.
83 |
84 | Args:
85 | futures_ (list): Future objects.
86 | """
87 | return self._client.cancel(futures_, **kwargs)
88 |
89 | def close(self, *args, **kwargs):
90 | """
91 | Close connection.
92 | """
93 | return self._client.close(*args, **kwargs)
94 |
95 | # @staticmethod
96 | # def _list_local_dir(pathname='**', *args, recursive=True):
97 | # non_py_files = []
98 | # py_files = []
99 | # for path in glob.glob(pathname, *args, recursive=recursive):
100 | # if os.path.isdir(path):
101 | # if path == '__pycache__':
102 | # continue
103 | # elif os.path.isfile(path):
104 | # if path.endswith('.pyc'):
105 | # continue
106 | # if path.endswith('.py'):
107 | # py_files.append(path)
108 | # else:
109 | # non_py_files.append(path)
110 | #
111 | # return non_py_files + py_files
112 |
--------------------------------------------------------------------------------
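
A connection sketch, assuming a dask scheduler is already running; the address and the
dependency file name are hypothetical:

    from rltk.remote import Remote

    remote = Remote('127.0.0.1:8786')            # hypothetical scheduler address
    remote.add_dependencies(['my_records.py'])   # hypothetical dependency file
    future = remote.submit(lambda x: x * 2, 21)
    print(remote.fetch(future))                  # 42
    remote.close()
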
/rltk/remote/task.py:
--------------------------------------------------------------------------------
1 | from threading import Semaphore
2 | from typing import Callable
3 |
4 | from rltk.remote.remote import Remote
5 |
6 |
7 | class Task(object):
8 | """
9 |     Remote task. It has a similar API to :meth:`rltk.ParallelProcessor`,
10 |     but do not use :meth:`rltk.ParallelProcessor` together with this module. If you still want multiprocessing,
11 |     give each worker more processes instead.
12 |
13 | Args:
14 | remote (Remote): Remote object.
15 | input_handler (Callable): Input handler.
16 |         output_handler (Callable): Output handler. It accepts the same number of arguments as `input_handler` returns.
17 |         chunk_size (int, optional): Size of each data chunk. Defaults to 1000.
18 | max_queue_size (int, optional): How many chunks can be in the queue. Defaults to 10.
19 | """
20 |
21 | def __init__(self, remote: Remote, input_handler: Callable, output_handler: Callable,
22 | chunk_size: int = 1000, max_queue_size: int = 10):
23 | self.remote = remote
24 | self.input_handler = input_handler
25 | self.output_handler = output_handler
26 |
27 | self.chunk_data = [] # buffer
28 | self.chunk_size = chunk_size # buffer size
29 |         self.future_semaphore = Semaphore(value=max_queue_size) # max number of unreturned futures
30 |         self.all_futures = set([]) # all unreturned future objects
31 | self.done = False
32 |
33 | def start(self):
34 | """
35 | Start listening.
36 | """
37 | pass
38 |
39 | @staticmethod
40 | def _parse_input(input_handler, data):
41 | return [input_handler(*args, **kwargs) for args, kwargs in data]
42 |
43 | def _parse_output(self, future):
44 | if future.done():
45 | for r in future.result():
46 | if not isinstance(r, tuple):
47 | r = (r,)
48 | self.output_handler(*r)
49 |
50 | # release resources no matter what condition that future gets
51 | self.all_futures.remove(future)
52 | self.future_semaphore.release()
53 |
54 | def compute(self, *args, **kwargs):
55 | """
56 | Add data to compute.
57 | """
58 | if self.done:
59 | return
60 |
61 | if len(self.chunk_data) < self.chunk_size:
62 | self.chunk_data.append(([*args], {**kwargs}))
63 | if len(self.chunk_data) == self.chunk_size:
64 | self._submit()
65 |
66 | def _submit(self):
67 | if len(self.chunk_data) == 0:
68 | return
69 |
70 | self.future_semaphore.acquire()
71 |
72 | # scatter input data (scatter first if data is large)
73 | data_future = self.remote.scatter(self.chunk_data)
74 |
75 | # input and output must be staticmethod, create wrappers to bypass restriction
76 | future = self.remote.submit(Task._parse_input, self.input_handler, data_future)
77 | Task._parse_output_wrapper = lambda ft: Task._parse_output(self, ft)
78 |
79 | # add listener
80 | future.add_done_callback(Task._parse_output_wrapper)
81 | self.all_futures.add(future)
82 |
83 | self.chunk_data = []
84 |
85 | def task_done(self):
86 | """
87 |         Indicate that all data which needs to be computed has been added.
88 | """
89 | self.done = True
90 | self._submit() # force flush buffer
91 |
92 | def join(self):
93 | """
94 | Block until all tasks are done.
95 | """
96 | while len(self.all_futures) != 0:
97 | pass
98 |
--------------------------------------------------------------------------------
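
A sketch of the chunked submit/collect cycle `Task` implements, assuming the same
hypothetical scheduler address as above:

    from rltk.remote import Remote, Task

    def input_handler(x):            # runs on the workers, once per compute() call
        return x, x * x

    def output_handler(x, squared):  # runs locally as chunks of results come back
        print(x, squared)

    remote = Remote('127.0.0.1:8786')
    task = Task(remote, input_handler=input_handler, output_handler=output_handler,
                chunk_size=100)
    task.start()
    for i in range(1000):
        task.compute(i)   # buffered; a chunk is submitted every 100 calls
    task.task_done()      # flush the last partial chunk
    task.join()           # busy-wait until all futures are consumed
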
/rltk/similarity/__init__.py:
--------------------------------------------------------------------------------
1 | # common distance
2 | from rltk.similarity.distance import euclidean_distance, euclidean_similarity, \
3 | manhattan_distance, manhattan_similarity
4 |
5 | # normal
6 | from rltk.similarity.equal import string_equal, number_equal
7 | from rltk.similarity.hamming import hamming_distance, hamming_similarity, normalized_hamming_distance
8 | from rltk.similarity.dice import dice_similarity
9 | from rltk.similarity.levenshtein import levenshtein_distance, levenshtein_similarity, \
10 | damerau_levenshtein_distance, damerau_levenshtein_similarity, \
11 | optimal_string_alignment_distance, optimal_string_alignment_similarity
12 | from rltk.similarity.needleman import needleman_wunsch_score, needleman_wunsch_similarity
13 | from rltk.similarity.jaro import jaro_winkler_distance, jaro_winkler_similarity, jaro_distance
14 | from rltk.similarity.jaccard import jaccard_index_similarity, jaccard_index_distance
15 | from rltk.similarity.cosine import cosine_similarity, string_cosine_similarity
16 | from rltk.similarity.tf_idf import tf_idf_similarity, compute_idf, compute_tf, tf_idf_cosine_similarity, TF_IDF
17 | from rltk.similarity.lcs import longest_common_subsequence_distance, metric_longest_common_subsequence
18 | from rltk.similarity.ngram import ngram_distance, ngram_similarity
19 | from rltk.similarity.qgram import qgram_distance, qgram_similarity
20 |
21 | # # hybrid
22 | from rltk.similarity.hybrid import hybrid_jaccard_similarity, monge_elkan_similarity, symmetric_monge_elkan_similarity
23 |
24 | # # phonetic
25 | from rltk.similarity.soundex import soundex_similarity, soundex
26 | from rltk.similarity.metaphone import metaphone_similarity, metaphone
27 | from rltk.similarity.nysiis import nysiis_similarity, nysiis
28 |
--------------------------------------------------------------------------------
/rltk/similarity/cosine.py:
--------------------------------------------------------------------------------
1 | import math
2 | import collections
3 | import rltk.utils as utils
4 |
5 |
6 | def cosine_similarity(vec1, vec2):
7 | """
8 |     The cosine similarity between two vectors.
9 |
10 | Args:
11 | vec1 (list): Vector 1. List of integer or float.
12 |         vec2 (list): Vector 2. List of integer or float. It should have the same length as vec1.
13 |
14 | Returns:
15 | float: Cosine similarity.
16 |
17 | Examples:
18 | >>> rltk.cosine_similarity([1, 2, 1, 3], [2, 5, 2, 3])
19 | 0.91634193
20 | """
21 |
22 | utils.check_for_none(vec1, vec2)
23 | utils.check_for_type(list, vec1, vec2)
24 | if len(vec1) != len(vec2):
25 | raise ValueError('vec1 and vec2 should have same length')
26 |
27 | v_x_y, v_x_2, v_y_2 = 0.0, 0.0, 0.0
28 | for v1, v2 in zip(vec1, vec2): # list of int / float
29 | v_x_y += v1 * v2
30 | v_x_2 += v1 * v1
31 | v_y_2 += v2 * v2
32 |
33 | return 0.0 if v_x_y == 0 else v_x_y / (math.sqrt(v_x_2) * math.sqrt(v_y_2))
34 |
35 |
36 | def string_cosine_similarity(bag1, bag2):
37 | """
38 |     The similarity between the two strings is the cosine of the angle between their vector representations.
39 |
40 | Args:
41 | bag1 (list): Bag1, tokenized string sequence.
42 | bag2 (list): Bag2, tokenized string sequence.
43 |
44 | Returns:
45 | float: Cosine similarity.
46 | """
47 |
48 | utils.check_for_none(bag1, bag2)
49 | utils.check_for_type(list, bag1, bag2)
50 |
51 | d1 = collections.Counter(bag1)
52 | d2 = collections.Counter(bag2)
53 |
54 | intersection = set(d1.keys()) & set(d2.keys())
55 | v_x_y = sum([d1[x] * d2[x] for x in intersection])
56 | v_x_2 = sum([v * v for k, v in d1.items()])
57 | v_y_2 = sum([v * v for k, v in d2.items()])
58 |
59 | return 0.0 if v_x_y == 0 else float(v_x_y) / (math.sqrt(v_x_2) * math.sqrt(v_y_2))
60 |
--------------------------------------------------------------------------------
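
A worked example of `string_cosine_similarity`, assuming the top-level `rltk`
re-exports used throughout the docstrings:

    import rltk

    # token counts: d1 = {'a': 2, 'b': 1}, d2 = {'a': 1, 'c': 1}
    # dot product = 2*1 = 2; |d1| = sqrt(5), |d2| = sqrt(2)
    print(rltk.string_cosine_similarity(['a', 'a', 'b'], ['a', 'c']))
    # 2 / (sqrt(5) * sqrt(2)) ~= 0.632
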
/rltk/similarity/dice.py:
--------------------------------------------------------------------------------
1 | import rltk.utils as utils
2 |
3 |
4 | def dice_similarity(set1, set2):
5 | """
6 |     The Dice similarity score is defined as twice the size of the intersection of two sets divided by the sum of their sizes.
7 |
8 | Args:
9 | set1 (set): Set 1.
10 | set2 (set): Set 2.
11 |
12 | Returns:
13 | float: Dice similarity.
14 |
15 | Examples:
16 | >>> rltk.dice_similarity(set(['a', 'b']), set(['c', 'b']))
17 | 0.5
18 | """
19 |
20 | utils.check_for_none(set1, set2)
21 | utils.check_for_type(set, set1, set2)
22 |
23 | if len(set1) == 0 or len(set2) == 0:
24 | return 0
25 |
26 | return 2.0 * float(len(set1 & set2)) / float(len(set1) + len(set2))
27 |
--------------------------------------------------------------------------------
/rltk/similarity/distance.py:
--------------------------------------------------------------------------------
1 | # https://docs.scipy.org/doc/scipy-0.14.0/reference/spatial.distance.html
2 | from scipy.spatial.distance import euclidean, cityblock
3 |
4 | import rltk.utils as utils
5 |
6 |
7 | def euclidean_distance(vec1, vec2, weights=None):
8 | """
9 | Euclidean distance.
10 |
11 | Args:
12 | vec1 (list): Vector 1. List of integer or float.
13 |         vec2 (list): Vector 2. List of integer or float. It should have the same length as vec1.
14 | weights (list): Weights for each value in vectors. If it's None, all weights will be 1.0. Defaults to None.
15 |
16 | Returns:
17 | float: Euclidean distance.
18 | """
19 |
20 | utils.check_for_none(vec1, vec2)
21 | utils.check_for_type(list, vec1, vec2)
22 | if weights:
23 | utils.check_for_type(list, weights)
24 | if len(vec1) != len(vec2):
25 | raise ValueError('vec1 and vec2 should have same length')
26 |
27 | return euclidean(vec1, vec2, weights)
28 |
29 |
30 | def euclidean_similarity(vec1, vec2, weights=None):
31 | """
32 | Computed as 1 / (1 + euclidean_distance)
33 | """
34 | return 1.0 / (1.0 + float(euclidean_distance(vec1, vec2, weights)))
35 |
36 |
37 | def manhattan_distance(vec1, vec2, weights=None):
38 | """
39 | Manhattan distance.
40 |
41 | Args:
42 | vec1 (list): Vector 1. List of integer or float.
43 |         vec2 (list): Vector 2. List of integer or float. It should have the same length as vec1.
44 | weights (list): Weights for each value in vectors. If it's None, all weights will be 1.0. Defaults to None.
45 |
46 | Returns:
47 | float: Manhattan distance.
48 | """
49 | utils.check_for_none(vec1, vec2)
50 | utils.check_for_type(list, vec1, vec2)
51 | if weights:
52 | utils.check_for_type(list, weights)
53 | if len(vec1) != len(vec2):
54 | raise ValueError('vec1 and vec2 should have same length')
55 |
56 | return cityblock(vec1, vec2, weights)
57 |
58 |
59 | def manhattan_similarity(vec1, vec2, weights=None):
60 | """
61 | Computed as 1 / (1 + manhattan_distance)
62 | """
63 | return 1.0 / (1.0 + manhattan_distance(vec1, vec2, weights))
64 |
--------------------------------------------------------------------------------
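
A worked example on a 3-4-5 triangle, assuming the top-level `rltk` re-exports:

    import rltk

    v1, v2 = [0.0, 3.0], [4.0, 0.0]
    print(rltk.euclidean_distance(v1, v2))    # 5.0
    print(rltk.euclidean_similarity(v1, v2))  # 1 / (1 + 5) ~= 0.167
    print(rltk.manhattan_distance(v1, v2))    # |0-4| + |3-0| = 7.0
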
/rltk/similarity/equal.py:
--------------------------------------------------------------------------------
1 | import rltk.utils as utils
2 |
3 |
4 | def string_equal(str1, str2):
5 | """
6 | Args:
7 |         str1 (str): String 1.
8 |         str2 (str): String 2.
9 |
10 | Returns:
11 | int: 0 for unequal and 1 for equal.
12 | """
13 |
14 | utils.check_for_none(str1, str2)
15 | utils.check_for_type(str, str1, str2)
16 | return int(str1 == str2)
17 |
18 |
19 | def number_equal(num1, num2, epsilon=0):
20 | """
21 | Args:
22 |         num1 (int / float): Number 1.
23 |         num2 (int / float): Number 2.
24 |         epsilon (float, optional): Approximation margin. Defaults to 0.
25 |
26 | Returns:
27 | int: 0 for unequal and 1 for equal.
28 | """
29 |
30 | utils.check_for_type((int, float), num1, num2)
31 | return int(abs(num1 - num2) <= epsilon)
32 |
--------------------------------------------------------------------------------
/rltk/similarity/hamming.py:
--------------------------------------------------------------------------------
1 | import rltk.utils as utils
2 |
3 |
4 | def hamming_distance(s1, s2):
5 | """
6 |     Hamming distance measures the minimum number of substitutions required to change one sequence into the
7 |     other.
8 |
9 | Args:
10 | s1 (str or list): Sequence 1.
11 | s2 (str or list): Sequence 2.
12 |
13 | Returns:
14 | int: Hamming distance between two sequences.
15 |
16 | Examples:
17 | >>> rltk.hamming_distance('ab','cd')
18 | 2
19 | >>> rltk.hamming_distance([1,2,3],[3,2,3])
20 | 1
21 | """
22 |
23 | utils.check_for_none(s1, s2)
24 | # utils.check_for_type(str, s1, s2)
25 |
26 | if len(s1) != len(s2):
27 | raise ValueError('Unequal length')
28 |
29 | return sum(c1 != c2 for c1, c2 in zip(s1, s2))
30 |
31 |
32 | def normalized_hamming_distance(s1, s2):
33 |
34 | max_len = max(len(s1), len(s2))
35 | if max_len == 0:
36 | return 0
37 |
38 | distance = hamming_distance(s1, s2)
39 | return float(distance) / max_len
40 |
41 |
42 | def hamming_similarity(s1, s2):
43 | """
44 | Hamming similarity is computed as 1 - normalized_hamming_distance.
45 |
46 | Args:
47 | s1 (str or list): Sequence 1.
48 | s2 (str or list): Sequence 2.
49 |
50 | Returns:
51 | float: Hamming similarity.
52 |
53 | Examples:
54 | >>> rltk.hamming_similarity('ab','cd')
55 | 0
56 | >>> rltk.hamming_similarity([1,2,3],[3,2,3])
57 | 0.666666666667
58 | """
59 |
60 | return 1 - normalized_hamming_distance(s1, s2)
61 |
--------------------------------------------------------------------------------
/rltk/similarity/hybrid.py:
--------------------------------------------------------------------------------
1 | from scipy.optimize import linear_sum_assignment
2 | import rltk.utils as utils
3 | from rltk.similarity.jaro import jaro_winkler_similarity
4 |
5 |
6 | def hybrid_jaccard_similarity(set1, set2, threshold=0.5, function=jaro_winkler_similarity,
7 | parameters=None, lower_bound=None):
8 | """
9 | Generalized Jaccard Measure.
10 |
11 | Args:
12 | set1 (set): Set 1.
13 | set2 (set): Set 2.
14 | threshold (float, optional): The threshold to keep the score of similarity function. \
15 | Defaults to 0.5.
16 |         function (function, optional): The reference of a similarity measure function. \
17 |             It should return a value in range [0,1]. \
18 |             Defaults to `jaro_winkler_similarity`.
19 |         parameters (dict, optional): Other parameters of function. Defaults to None.
20 |         lower_bound (float, optional): This is for early exit. If the similarity cannot possibly reach this value, \
21 |             the function returns 0.0 immediately. Defaults to None.
22 |
23 | Returns:
24 | float: Hybrid Jaccard similarity.
25 |
26 | Examples:
27 | >>> def hybrid_test_similarity(m ,n):
28 | ... ...
29 | >>> rltk.hybrid_jaccard_similarity(set(['a','b','c']), set(['p', 'q']), function=hybrid_test_similarity)
30 | 0.533333333333
31 | """
32 |
33 | utils.check_for_none(set1, set2)
34 | utils.check_for_type(set, set1, set2)
35 |
36 | parameters = parameters if isinstance(parameters, dict) else {}
37 |
38 | if len(set1) > len(set2):
39 | set1, set2 = set2, set1
40 | total_num_matches = len(set1)
41 |
42 | matching_score = [[1.0] * len(set2) for _ in range(len(set1))]
43 | row_max = [0.0] * len(set1)
44 | for i, s1 in enumerate(set1):
45 | for j, s2 in enumerate(set2):
46 | score = function(s1, s2, **parameters)
47 | if score < threshold:
48 | score = 0.0
49 | row_max[i] = max(row_max[i], score)
50 | matching_score[i][j] = 1.0 - score # munkres finds out the smallest element
51 |
52 | if lower_bound:
53 | max_possible_score_sum = sum(row_max[:i+1] + [1] * (total_num_matches - i - 1))
54 | max_possible = 1.0 * max_possible_score_sum / float(len(set1) + len(set2) - total_num_matches)
55 | if max_possible < lower_bound:
56 | return 0.0
57 |
58 | # run munkres, finds the min score (max similarity) for each row
59 | row_idx, col_idx = linear_sum_assignment(matching_score)
60 |
61 | # recover scores
62 | score_sum = 0.0
63 | for r, c in zip(row_idx, col_idx):
64 | score_sum += 1.0 - matching_score[r][c]
65 |
66 | if len(set1) + len(set2) - total_num_matches == 0:
67 | return 1.0
68 | sim = float(score_sum) / float(len(set1) + len(set2) - total_num_matches)
69 | if lower_bound and sim < lower_bound:
70 | return 0.0
71 | return sim
72 |
73 |
74 | def monge_elkan_similarity(bag1, bag2, function=jaro_winkler_similarity, parameters=None, lower_bound=None):
75 | """
76 | Monge Elkan similarity.
77 |
78 | Args:
79 | bag1 (list): Bag 1.
80 | bag2 (list): Bag 2.
81 |         function (function, optional): The reference of a similarity measure function. \
82 |             It should return a value in range [0,1]. \
83 |             Defaults to `jaro_winkler_similarity`.
84 |         parameters (dict, optional): Other parameters of function. Defaults to None.
85 |         lower_bound (float, optional): This is for early exit. If the similarity cannot possibly reach this value, \
86 |             the function returns 0.0 immediately. Defaults to None.
87 |
88 | Returns:
89 | float: Monge Elkan similarity.
90 |
91 | Note:
92 | The order of bag1 and bag2 matters. \
93 | Alternatively, `symmetric_monge_elkan_similarity` is not sensitive to the order.
94 |         If `lower_bound` is set, the early exit condition is triggered more easily when bag1 is larger.
95 | """
96 |
97 | utils.check_for_none(bag1, bag2)
98 | utils.check_for_type(list, bag1, bag2)
99 |
100 | parameters = parameters if isinstance(parameters, dict) else {}
101 |
102 | score_sum = 0
103 | for idx, ele1 in enumerate(bag1):
104 | max_score = utils.MIN_FLOAT
105 | for ele2 in bag2:
106 | max_score = max(max_score, function(ele1, ele2, **parameters))
107 | score_sum += max_score
108 |
109 | # if it satisfies early exit condition
110 | if lower_bound:
111 | rest_max = len(bag1) - 1 - idx # assume the rest scores are all 1
112 | if float(score_sum + rest_max) / float(len(bag1)) < lower_bound:
113 | return 0.0
114 |
115 | sim = float(score_sum) / float(len(bag1))
116 | if lower_bound and sim < lower_bound:
117 | return 0.0
118 | return sim
119 |
120 |
121 | def symmetric_monge_elkan_similarity(bag1, bag2, function=jaro_winkler_similarity, parameters=None, lower_bound=None):
122 | """
123 | Symmetric Monge Elkan similarity is computed by \
124 | (monge_elkan_similarity(b1, b2) + monge_elkan_similarity(b2, b1)) / 2.
125 |
126 | Note:
127 |         If `lower_bound` is given, the return value will be zero unless \
128 |         both `monge_elkan_similarity` scores are greater than it.
129 | """
130 |
131 | s1 = monge_elkan_similarity(bag1, bag2, function, parameters, lower_bound=lower_bound)
132 | if lower_bound and s1 == 0:
133 | return 0.0
134 | s2 = monge_elkan_similarity(bag2, bag1, function, parameters, lower_bound=lower_bound)
135 | if lower_bound and s2 == 0:
136 | return 0.0
137 | return (s1 + s2) / 2
138 |
--------------------------------------------------------------------------------
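
A sketch of `monge_elkan_similarity` and its early exit, assuming the top-level `rltk`
re-exports; the token lists are hypothetical:

    import rltk

    names1 = ['john', 'doe']
    names2 = ['jon', 'doe', 'jr']
    # each element of names1 is matched to its best counterpart in names2,
    # using jaro_winkler_similarity by default
    print(rltk.monge_elkan_similarity(names1, names2))
    # with a lower_bound, 0.0 is returned as soon as the bound is provably unreachable
    print(rltk.monge_elkan_similarity(names1, names2, lower_bound=0.99))  # 0.0
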
/rltk/similarity/jaccard.py:
--------------------------------------------------------------------------------
1 | import rltk.utils as utils
2 |
3 |
4 | def _jaccard_index(set1, set2):
5 | utils.check_for_none(set1, set2)
6 | utils.check_for_type(set, set1, set2)
7 |
8 | if len(set1) == 0 or len(set2) == 0:
9 | return 0
10 |
11 | # return float(len(set1 & set2)) / float(len(set1 | set2))
12 |
13 | inter_len = len(set1 & set2)
14 | return float(inter_len) / (len(set1) + len(set2) - inter_len)
15 |
16 |
17 | def jaccard_index_similarity(set1, set2):
18 | """
19 |     The Jaccard Index Similarity is computed as intersection(set1, set2) / union(set1, set2).
20 |
21 | Args:
22 | set1 (set): Set 1.
23 | set2 (set): Set 2.
24 |
25 | Returns:
26 | float: Jaccard Index similarity.
27 |
28 | Examples:
29 | >>> rltk.jaccard_index_similarity(set(['a','b']), set(['a','c']))
30 | 0.3333333333333333
31 | >>> rltk.jaccard_index_similarity(set(['a','b']), set(['c','d']))
32 | 0.0
33 | """
34 | return _jaccard_index(set1, set2)
35 |
36 |
37 | def jaccard_index_distance(set1, set2):
38 | """
39 |     The Jaccard Index Distance is computed as 1 - jaccard_index_similarity.
40 |
41 | Args:
42 | set1 (set): Set 1.
43 | set2 (set): Set 2.
44 |
45 | Returns:
46 |         float: Jaccard Index Distance.
47 | """
48 | return 1 - jaccard_index_similarity(set1, set2)
49 |
--------------------------------------------------------------------------------
/rltk/similarity/jaro.py:
--------------------------------------------------------------------------------
1 | import math
2 | import rltk.utils as utils
3 |
4 |
5 | def _jaro_winkler(s1, s2, threshold=0.7, scaling_factor=0.1, prefix_len=4):
6 | jaro = _jaro_distance(s1, s2)
7 | if jaro > threshold:
8 | l = min(len(_get_prefix(s1, s2)), prefix_len) # max len of common prefix is 4
9 | jaro += (scaling_factor * l * (1.0 - jaro))
10 | return jaro
11 |
12 |
13 | def jaro_winkler_similarity(s1, s2, threshold=0.7, scaling_factor=0.1, prefix_len=4):
14 | """
15 | The max length for common prefix is 4.
16 |
17 | Args:
18 | s1 (str): Sequence 1.
19 | s2 (str): Sequence 2.
20 |         threshold (float, optional): Boost threshold, prefix bonus is only added when compared strings have a Jaro Distance above it. Defaults to 0.7.
21 |         scaling_factor (float, optional): Scaling factor for how much the score is adjusted upwards for having common prefixes. Defaults to 0.1.
22 |
23 | Returns:
24 | float: Jaro Winkler Similarity.
25 |
26 | Examples:
27 | >>> rltk.jaro_winkler_similarity('abchello', 'abcworld')
28 | 0.6833333333333332
29 | >>> rltk.jaro_winkler_similarity('hello', 'world')
30 | 0.4666666666666666
31 | """
32 | return _jaro_winkler(s1, s2, threshold, scaling_factor, prefix_len)
33 |
34 |
35 | def jaro_winkler_distance(s1, s2, threshold=0.7, scaling_factor=0.1, prefix_len=4):
36 | """
37 | Jaro Winkler Distance is computed as 1 - jaro_winkler_similarity.
38 |
39 | Args:
40 | s1 (str): Sequence 1.
41 | s2 (str): Sequence 2.
42 |         threshold (float, optional): Boost threshold, prefix bonus is only added when compared strings have a Jaro Distance above it. Defaults to 0.7.
43 |         scaling_factor (float, optional): Scaling factor for how much the score is adjusted upwards for having common prefixes. Defaults to 0.1.
44 | 
45 |     Returns:
46 |         float: Jaro Winkler Distance.
47 | 
48 |     Examples:
49 |         >>> round(rltk.jaro_winkler_distance('abchello', 'abcworld'), 2)
50 |         0.32
51 |         >>> round(rltk.jaro_winkler_distance('hello', 'world'), 2)
52 |         0.53
53 | """
54 | return 1 - _jaro_winkler(s1, s2, threshold, scaling_factor, prefix_len)
55 |
56 |
57 | def jaro_distance(s1, s2):
58 | """
59 | Args:
60 | s1 (str): Sequence 1.
61 | s2 (str): Sequence 2.
62 |
63 | Returns:
64 | float: Jaro Distance.
65 |
66 | Examples:
67 | >>> rltk.jaro_distance('abc', 'abd')
68 | 0.7777777777777777
69 | >>> rltk.jaro_distance('abccd', 'abcdc')
70 | 0.9333333333333332
71 | """
72 | return _jaro_distance(s1, s2)
73 |
74 |
75 | def _jaro_distance(s1, s2):
76 | # code from https://github.com/nap/jaro-winkler-distance
77 | # Copyright Jean-Bernard Ratte
78 |
79 | utils.check_for_none(s1, s2)
80 | utils.check_for_type(str, s1, s2)
81 |
82 | # s1 = utils.unicode_normalize(s1)
83 | # s2 = utils.unicode_normalize(s2)
84 |
85 | shorter, longer = s1.lower(), s2.lower()
86 |
87 | if len(s1) > len(s2):
88 | longer, shorter = shorter, longer
89 |
90 | m1 = _get_matching_characters(shorter, longer)
91 | m2 = _get_matching_characters(longer, shorter)
92 |
93 | if len(m1) == 0 or len(m2) == 0:
94 | return 0.0
95 |
96 | return (float(len(m1)) / len(shorter) +
97 | float(len(m2)) / len(longer) +
98 | float(len(m1) - _transpositions(m1, m2)) / len(m1)) / 3.0
99 |
100 |
101 | def _get_diff_index(first, second):
102 | if first == second:
103 | return -1
104 |
105 | if not first or not second:
106 | return 0
107 |
108 | max_len = min(len(first), len(second))
109 | for i in range(0, max_len):
110 | if not first[i] == second[i]:
111 | return i
112 |
113 | return max_len
114 |
115 |
116 | def _get_prefix(first, second):
117 | if not first or not second:
118 | return ''
119 |
120 | index = _get_diff_index(first, second)
121 | if index == -1:
122 | return first
123 | elif index == 0:
124 | return ''
125 | else:
126 | return first[0:index]
127 |
128 |
129 | def _get_matching_characters(first, second):
130 | common = []
131 | limit = math.floor(min(len(first), len(second)) / 2)
132 |
133 | for i, l in enumerate(first):
134 | left, right = int(max(0, i - limit)), int(min(i + limit + 1, len(second)))
135 | if l in second[left:right]:
136 | common.append(l)
137 | second = second[0:second.index(l)] + '*' + second[second.index(l) + 1:]
138 |
139 | return ''.join(common)
140 |
141 |
142 | def _transpositions(first, second):
143 | return math.floor(len([(f, s) for f, s in zip(first, second) if not f == s]) / 2.0)
144 |
--------------------------------------------------------------------------------
/rltk/similarity/lcs.py:
--------------------------------------------------------------------------------
1 | import rltk.utils as utils
2 |
3 |
4 | def _lcs(s1, s2):
5 | m, n = len(s1), len(s2)
6 |
7 | dp = [[None] * (n + 1) for i in range(m + 1)]
8 |
9 | for i in range(m + 1):
10 | for j in range(n + 1):
11 | if i == 0 or j == 0:
12 | dp[i][j] = 0
13 | elif s1[i - 1] == s2[j - 1]:
14 | dp[i][j] = dp[i - 1][j - 1] + 1
15 | else:
16 | dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
17 |
18 | return dp[m][n]
19 |
20 |
21 | def longest_common_subsequence_distance(s1, s2):
22 | """
23 |     The LCS distance between strings X (of length n) and Y (of length m) is computed as n + m - 2|LCS(X, Y)| (min = 0, max = n + m).
24 |
25 | Args:
26 | s1 (str): Sequence 1.
27 | s2 (str): Sequence 2.
28 |
29 | Returns:
30 | float: Longest Common Subsequence Distance.
31 |
32 | Examples:
33 | >>> rltk.longest_common_subsequence_distance('abcd', 'acbd')
34 | 2
35 | >>> rltk.longest_common_subsequence_distance('abcdefg', 'acef')
36 | 3
37 | """
38 | utils.check_for_none(s1, s2)
39 | utils.check_for_type(str, s1, s2)
40 |
41 | m, n = len(s1), len(s2)
42 |
43 | # dp = [[None] * (n + 1) for i in range(m + 1)]
44 |
45 | lcs = _lcs(s1, s2)
46 | return n + m - 2 * lcs
47 |
48 |
49 | def metric_longest_common_subsequence(s1, s2):
50 | """
51 |     The Metric LCS distance between two strings is computed as 1 - |LCS(s1, s2)| / max(|s1|, |s2|).
52 |
53 | Args:
54 | s1 (str): Sequence 1.
55 | s2 (str): Sequence 2.
56 |
57 | Returns:
58 | float: Metric Longest Common Subsequence Distance.
59 |
60 | Examples:
61 |         >>> rltk.metric_longest_common_subsequence('ABCDEFG', 'ABCDEFHJKL')
62 |         0.4
63 |         # LCS: ABCDEF => length = 6
64 |         # longest = s2 => length = 10
65 |         # => 1 - 6/10 = 0.4
66 | 
67 |         >>> rltk.metric_longest_common_subsequence('ABDEF', 'ABDIF')
68 |         0.19999999999999996
69 |         # LCS: ABDF => length = 4
70 |         # longest = ABDEF => length = 5
71 |         # => 1 - 4/5 = 0.2 (with floating point rounding)
72 | """
73 | utils.check_for_none(s1, s2)
74 | utils.check_for_type(str, s1, s2)
75 |
76 | lcs = _lcs(s1, s2)
77 | return 1 - float(lcs) / max(len(s1), len(s2), 1)
78 |
--------------------------------------------------------------------------------
/rltk/similarity/metaphone.py:
--------------------------------------------------------------------------------
1 | import rltk.utils as utils
2 |
3 |
4 | def metaphone(s):
5 | """
6 | Metaphone fundamentally improves on the Soundex algorithm by using information about variations and inconsistencies in English spelling and pronunciation to produce a more accurate encoding, which does a better job of matching words and names which sound similar. As with Soundex, similar-sounding words should share the same keys. Metaphone is available as a built-in operator in a number of systems.
7 |
8 | Args:
9 | s (str): Sequence.
10 |
11 | Returns:
12 | str: Coded sequence.
13 |
14 | Examples:
15 | >>> rltk.metaphone('ashcraft')
16 | 'AXKRFT'
17 | >>> rltk.metaphone('pineapple')
18 | 'PNPL'
19 | """
20 | # code from https://github.com/jamesturk/jellyfish
21 | # Copyright (c) 2015, James Turk
22 | # Copyright (c) 2015, Sunlight Foundation
23 | # All rights reserved.
24 |
25 | utils.check_for_none(s)
26 | utils.check_for_type(str, s)
27 |
28 | s = utils.unicode_normalize(s)
29 |
30 | if len(s) == 0:
31 | raise ValueError('Empty string')
32 |
33 | s = s.lower()
34 | result = []
35 |
36 | # skip first character if s starts with these
37 | if s.startswith(('kn', 'gn', 'pn', 'ac', 'wr', 'ae')):
38 | s = s[1:]
39 |
40 | i = 0
41 |
42 | while i < len(s):
43 | c = s[i]
44 | next_ = s[i+1] if i < len(s)-1 else '*****'
45 | nextnext = s[i+2] if i < len(s)-2 else '*****'
46 |
47 | # skip doubles except for cc
48 | if c == next_ and c != 'c':
49 | i += 1
50 | continue
51 |
52 | if c in 'aeiou':
53 | if i == 0 or s[i-1] == ' ':
54 | result.append(c)
55 | elif c == 'b':
56 | if (not (i != 0 and s[i-1] == 'm')) or next_:
57 | result.append('b')
58 | elif c == 'c':
59 | if next_ == 'i' and nextnext == 'a' or next_ == 'h':
60 | result.append('x')
61 | i += 1
62 | elif next_ in 'iey':
63 | result.append('s')
64 | i += 1
65 | else:
66 | result.append('k')
67 | elif c == 'd':
68 | if next_ == 'g' and nextnext in 'iey':
69 | result.append('j')
70 | i += 2
71 | else:
72 | result.append('t')
73 | elif c in 'fjlmnr':
74 | result.append(c)
75 | elif c == 'g':
76 | if next_ in 'iey':
77 | result.append('j')
78 | elif next_ not in 'hn':
79 | result.append('k')
80 | elif next_ == 'h' and nextnext and nextnext not in 'aeiou':
81 | i += 1
82 | elif c == 'h':
83 | if i == 0 or next_ in 'aeiou' or s[i-1] not in 'aeiou':
84 | result.append('h')
85 | elif c == 'k':
86 | if i == 0 or s[i-1] != 'c':
87 | result.append('k')
88 | elif c == 'p':
89 | if next_ == 'h':
90 | result.append('f')
91 | i += 1
92 | else:
93 | result.append('p')
94 | elif c == 'q':
95 | result.append('k')
96 | elif c == 's':
97 | if next_ == 'h':
98 | result.append('x')
99 | i += 1
100 | elif next_ == 'i' and nextnext in 'oa':
101 | result.append('x')
102 | i += 2
103 | else:
104 | result.append('s')
105 | elif c == 't':
106 | if next_ == 'i' and nextnext in 'oa':
107 | result.append('x')
108 | elif next_ == 'h':
109 | result.append('0')
110 | i += 1
111 | elif next_ != 'c' or nextnext != 'h':
112 | result.append('t')
113 | elif c == 'v':
114 | result.append('f')
115 | elif c == 'w':
116 | if i == 0 and next_ == 'h':
117 | i += 1
118 | if nextnext in 'aeiou' or nextnext == '*****':
119 | result.append('w')
120 | elif c == 'x':
121 | if i == 0:
122 | if next_ == 'h' or (next_ == 'i' and nextnext in 'oa'):
123 | result.append('x')
124 | else:
125 | result.append('s')
126 | else:
127 | result.append('k')
128 | result.append('s')
129 | elif c == 'y':
130 | if next_ in 'aeiou':
131 | result.append('y')
132 | elif c == 'z':
133 | result.append('s')
134 | elif c == ' ':
135 | if len(result) > 0 and result[-1] != ' ':
136 | result.append(' ')
137 |
138 | i += 1
139 |
140 | return ''.join(result).upper()
141 |
142 |
143 | def metaphone_similarity(s1, s2):
144 | """
145 | metaphone(s1) == metaphone(s2)
146 |
147 | Args:
148 | s1 (str): Sequence.
149 | s2 (str): Sequence.
150 |
151 | Returns:
152 | int: 1 if metaphone(s1) equals metaphone(s2), 0 otherwise.
153 | """
154 | return 1 if metaphone(s1) == metaphone(s2) else 0
155 |
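156 | # Illustrative usage (a doctest-style sketch, not an exhaustive test):
157 | # 'ashcraft' and 'ashcroft' share the Metaphone code 'AXKRFT', so the
158 | # pair scores 1.
159 | #
160 | #   >>> metaphone('ashcroft')
161 | #   'AXKRFT'
162 | #   >>> metaphone_similarity('ashcraft', 'ashcroft')
163 | #   1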
--------------------------------------------------------------------------------
/rltk/similarity/needleman.py:
--------------------------------------------------------------------------------
1 | import rltk.utils as utils
2 |
3 |
4 | def _get_score(c1, c2, match, mismatch, score_table):
5 | """
6 | if there's no score found in score_table, match & mismatch will be used.
7 | """
8 | if score_table and c1 in score_table and c2 in score_table[c1]:
9 | return score_table[c1][c2]
10 | else:
11 | return match if c1 == c2 else mismatch
12 |
13 |
14 | def needleman_wunsch_score(s1, s2, match=2, mismatch=-1, gap=-0.5, score_table=None):
15 | """
16 | Needleman-Wunsch score: the maximum global-alignment score of s1 and s2 under the given match/mismatch scores, gap penalty and optional score_table.
17 | """
18 | utils.check_for_none(s1, s2)
19 | utils.check_for_type(str, s1, s2)
20 |
21 | score_table = score_table if isinstance(score_table, dict) else {}
22 |
23 | # s1 = utils.unicode_normalize(s1)
24 | # s2 = utils.unicode_normalize(s2)
25 |
26 | n1, n2 = len(s1), len(s2)
27 | if n1 == 0 and n2 == 0:
28 | return 0
29 |
30 | # construct matrix to get max score of all possible alignments
31 | dp = [[0] * (n2 + 1) for _ in range(n1 + 1)]
32 | for i in range(n1 + 1):
33 | for j in range(n2 + 1):
34 | if i == 0 and j == 0: # [0,0]
35 | continue
36 | elif i == 0: # most top row
37 | dp[i][j] = gap + dp[i][j - 1]
38 | elif j == 0: # most left column
39 | dp[i][j] = gap + dp[i - 1][j]
40 | else:
41 | dp[i][j] = max(dp[i][j - 1] + gap,
42 | dp[i - 1][j] + gap,
43 | dp[i - 1][j - 1] + _get_score(s1[i - 1], s2[j - 1], match, mismatch, score_table))
44 |
45 | return dp[n1][n2]
46 |
47 |
48 | def needleman_wunsch_similarity(s1, s2, match=2, mismatch=-1, gap=-0.5, score_table=None):
49 | """
50 | This Needleman-Wunsch similarity is computed as needleman_wunsch_score(s1, s2) divided by the maximum self-alignment score of s1 and s2.
51 |
52 | Args:
53 | s1 (str): Sequence 1.
54 | s2 (str): Sequence 2.
55 | match (int, optional): Score of a match. Defaults to 2.
56 | mismatch (int, optional): Score of a mismatch. Defaults to -1.
57 | gap (float, optional): Gap penalty. Defaults to -0.5.
58 | score_table (dict, optional): Alignment score matrix. Defaults to None.
59 |
60 | Returns:
61 | float: Needleman Wunsch Similarity.
62 | """
63 |
64 | nm = needleman_wunsch_score(s1, s2, match, mismatch, gap, score_table)
65 |
66 | # score_table = {'a': {'c': 3}, 'e': {'f': 9, 'k': 1}}
67 | score_s1 = sum([_get_score(c1, c1, match, mismatch, score_table) for c1 in s1])
68 | score_s2 = sum([_get_score(c2, c2, match, mismatch, score_table) for c2 in s2])
69 |
70 | max_score = max(score_s1, score_s2)
71 |
72 | if max_score < nm:
73 | raise ValueError('Illegal value of score_table')
74 |
75 | return float(nm) / max_score
76 |
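77 | # Illustrative usage (a sketch; numbers follow from the defaults above):
78 | # with match=2, mismatch=-1 and gap=-0.5, the best alignment of 'abc'
79 | # and 'abd' scores 2 + 2 - 1 = 3.0, and the larger self-alignment score
80 | # is 3 * 2 = 6, so the similarity is 3.0 / 6 = 0.5.
81 | #
82 | #   >>> needleman_wunsch_score('abc', 'abd')
83 | #   3.0
84 | #   >>> needleman_wunsch_similarity('abc', 'abd')
85 | #   0.5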
--------------------------------------------------------------------------------
/rltk/similarity/ngram.py:
--------------------------------------------------------------------------------
1 | import rltk.utils as utils
2 |
3 |
4 | def ngram_distance(s0, s1, n=2):
5 | """
6 | N-Gram Distance as defined by Kondrak, "N-Gram Similarity and Distance" String Processing and Information Retrieval, Lecture Notes in Computer Science Volume 3772, 2005, pp 115-126.
7 |
8 | Args:
9 | s0 (str): Sequence 1.
10 | s1 (str): Sequence 2.
11 |
12 | Returns:
13 | float: NGram Distance.
14 |
15 | Examples:
16 | >>> rltk.ngram_distance('ABCD', 'ABTUIO')
17 | 0.5833
18 | """
19 |
20 | utils.check_for_none(s0, s1)
21 | utils.check_for_type(str, s0, s1)
22 |
23 | n1, n2 = len(s0), len(s1)
24 | special = "\n"
25 |
26 | if (n1 == 0 or n2 == 0):
27 | return 1
28 |
29 | if (s0 == s1):
30 | return 0
31 |
32 | cost = 0
33 | if (n1 < n or n2 < n):
34 | return 1
35 |
36 | # Adding special chars (n-1) to s0
37 | sa = special * (n - 1) + s0
38 |
39 | s2_j = [None] * n # jth n-gram of s2
40 | d = [0] * (n1 + 1) # cost array, horizontally
41 | p = [0] * (n1 + 1) # 'previous' cost array, horizontally
42 |
43 | for i in range(n1 + 1):
44 | p[i] = i
45 |
46 | for j in range(1, n2 + 1):
47 | # Construct s2_j n-gram
48 | if (j < n):
49 | for ti in range(n - j):
50 | s2_j[ti] = special
51 |
52 | for ti in range(n - j, n):
53 | s2_j[ti] = s1[ti - (n - j)]
54 |
55 | else:
56 | s2_j = list(s1[j - n: j])
57 |
58 | d[0] = j
59 |
60 | for i in range(1, n1 + 1):
61 | cost = 0
62 | tn = n
63 | # Compare sa to s2_j
64 | for ni in range(n):
65 | if sa[i - 1 + ni] != s2_j[ni]:
66 | cost += 1
67 | elif sa[i - 1 + ni] == special:
68 | tn -= 1
69 |
70 | ec = float(cost) / tn
71 | # minimum of cell to the left+1, to the top+1,
72 | # diagonally left and up +cost
73 | d[i] = min(d[i - 1] + 1, p[i] + 1, p[i - 1] + ec)
74 |
75 | d2 = p
76 | p = d
77 | d = d2
78 | return float(p[n1]) / max(n2, n1)
79 |
80 |
81 | def ngram_similarity(s0, s1, n=2):
82 | """
83 | N-Gram Similarity as defined by Kondrak, "N-Gram Similarity and Distance" String Processing and Information Retrieval, Lecture Notes in Computer Science Volume 3772, 2005, pp 115-126.
84 |
85 | Args:
86 | s0 (str): Sequence 1.
87 | s1 (str): Sequence 2.
88 |
89 | Returns:
90 | float: NGram Similarity.
91 |
92 | Examples:
93 | >>> rltk.ngram_similarity('ABCD', 'ABTUIO')
94 | 0.4166666666666667
95 | """
96 |
97 | utils.check_for_none(s0, s1)
98 | utils.check_for_type(str, s0, s1)
99 |
100 | n1, n2 = len(s0), len(s1)
101 | special = "\n"
102 |
103 | if (n1 == 0 or n2 == 0):
104 | return 0
105 |
106 | if (s0 == s1):
107 | return 1
108 |
109 | cost = 0
110 | if (n1 < n or n2 < n):
111 | return 0
112 |
113 | # Adding special chars (n-1) to s0
114 | sa = special * (n - 1) + s0
115 |
116 | s2_j = [None] * n # jth n-gram of s2
117 | d = [0] * (n1 + 1) # cost array, horizontally
118 | p = [0] * (n1 + 1) # 'previous' cost array, horizontally
119 |
120 | for i in range(n1 + 1):
121 | p[i] = 0
122 |
123 | for j in range(1, n2 + 1):
124 | # Construct s2_j n-gram
125 | if (j < n):
126 | for ti in range(n - j):
127 | s2_j[ti] = special
128 |
129 | for ti in range(n - j, n):
130 | s2_j[ti] = s1[ti - (n - j)]
131 |
132 | else:
133 | s2_j = list(s1[j - n: j])
134 |
135 | d[0] = 0
136 |
137 | for i in range(1, n1 + 1):
138 | cost = 0
139 | tn = n
140 | # Compare sa to s2_j
141 | for ni in range(n):
142 | if sa[i - 1 + ni] == s2_j[ni] and sa[i - 1 + ni] != "\n":
143 | cost += 1
144 | elif sa[i - 1 + ni] == special:
145 | tn -= 1
146 |
147 | ec = float(cost) / tn
148 | # minimum of cell to the left+1, to the top+1,
149 | # diagonally left and up +cost
150 | d[i] = max(d[i - 1], p[i], p[i - 1] + ec)
151 |
152 | d2 = p
153 | p = d
154 | d = d2
155 | return float(p[n1]) / max(n2, n1)
156 |
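157 | # Illustrative usage (a sketch; values match the docstrings above):
158 | # both measures are normalized by the length of the longer string.
159 | #
160 | #   >>> round(ngram_distance('ABCD', 'ABTUIO'), 4)
161 | #   0.5833
162 | #   >>> ngram_similarity('ABCD', 'ABTUIO')
163 | #   0.4166666666666667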
--------------------------------------------------------------------------------
/rltk/similarity/nysiis.py:
--------------------------------------------------------------------------------
1 | import rltk.utils as utils
2 |
3 |
4 | def nysiis(s):
5 | """
6 | New York State Immunization Information System (NYSIIS) Phonetic Code is a phonetic algorithm created by the New York State Department of Health's (NYSDOH) Bureau of Immunization.
7 |
8 | Args:
9 | s (str): Sequence.
10 |
11 | Returns:
12 | str: Coded sequence.
13 |
14 | Examples:
15 | >>> rltk.nysiis('ashcraft')
16 | 'ASCRAFT'
17 | >>> rltk.nysiis('pineapple')
18 | 'PANAPL'
19 | """
20 | # code from https://github.com/jamesturk/jellyfish
21 | # Copyright (c) 2015, James Turk
22 | # Copyright (c) 2015, Sunlight Foundation
23 | # All rights reserved.
24 |
25 | utils.check_for_none(s)
26 | utils.check_for_type(str, s)
27 |
28 | s = utils.unicode_normalize(s)
29 |
30 | if len(s) == 0:
31 | raise ValueError('Empty string')
32 |
33 | s = s.upper()
34 | key = []
35 |
36 | # step 1 - prefixes
37 | if s.startswith('MAC'):
38 | s = 'MCC' + s[3:]
39 | elif s.startswith('KN'):
40 | s = s[1:]
41 | elif s.startswith('K'):
42 | s = 'C' + s[1:]
43 | elif s.startswith(('PH', 'PF')):
44 | s = 'FF' + s[2:]
45 | elif s.startswith('SCH'):
46 | s = 'SSS' + s[3:]
47 |
48 | # step 2 - suffixes
49 | if s.endswith(('IE', 'EE')):
50 | s = s[:-2] + 'Y'
51 | elif s.endswith(('DT', 'RT', 'RD', 'NT', 'ND')):
52 | s = s[:-2] + 'D'
53 |
54 | # step 3 - first character of key comes from name
55 | key.append(s[0])
56 |
57 | # step 4 - translate remaining chars
58 | i = 1
59 | len_s = len(s)
60 | while i < len_s:
61 | ch = s[i]
62 | if ch == 'E' and i + 1 < len_s and s[i + 1] == 'V':
63 | ch = 'AF'
64 | i += 1
65 | elif ch in 'AEIOU':
66 | ch = 'A'
67 | elif ch == 'Q':
68 | ch = 'G'
69 | elif ch == 'Z':
70 | ch = 'S'
71 | elif ch == 'M':
72 | ch = 'N'
73 | elif ch == 'K':
74 | if i + 1 < len(s) and s[i + 1] == 'N':
75 | ch = 'N'
76 | else:
77 | ch = 'C'
78 | elif ch == 'S' and s[i + 1:i + 3] == 'CH':
79 | ch = 'SS'
80 | i += 2
81 | elif ch == 'P' and i + 1 < len(s) and s[i + 1] == 'H':
82 | ch = 'F'
83 | i += 1
84 | elif ch == 'H' and (s[i - 1] not in 'AEIOU' or (i + 1 < len(s) and s[i + 1] not in 'AEIOU')):
85 | if s[i - 1] in 'AEIOU':
86 | ch = 'A'
87 | else:
88 | ch = s[i - 1]
89 | elif ch == 'W' and s[i - 1] in 'AEIOU':
90 | ch = s[i - 1]
91 |
92 | if ch[-1] != key[-1][-1]:
93 | key.append(ch)
94 |
95 | i += 1
96 |
97 | key = ''.join(key)
98 |
99 | # step 5 - remove trailing S
100 | if key.endswith('S') and key != 'S':
101 | key = key[:-1]
102 |
103 | # step 6 - replace AY w/ Y
104 | if key.endswith('AY'):
105 | key = key[:-2] + 'Y'
106 |
107 | # step 7 - remove trailing A
108 | if key.endswith('A') and key != 'A':
109 | key = key[:-1]
110 |
111 | # step 8 was already done
112 |
113 | return key
114 |
115 |
116 | def nysiis_similarity(s1, s2):
117 | """
118 | nysiis(s1) == nysiis(s2)
119 |
120 | Args:
121 | s1 (str): Sequence.
122 | s2 (str): Sequence.
123 |
124 | Returns:
125 | float: if nysiis(s1) equals to nysiis(s2)
126 | """
127 | return 1 if nysiis(s1) == nysiis(s2) else 0
128 |
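129 | # Illustrative usage (a sketch; codes derived by tracing the steps above):
130 | # 'ashcraft' and 'ashcroft' both encode to 'ASCRAFT', so the pair scores 1.
131 | #
132 | #   >>> nysiis_similarity('ashcraft', 'ashcroft')
133 | #   1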
--------------------------------------------------------------------------------
/rltk/similarity/qgram.py:
--------------------------------------------------------------------------------
1 | import rltk.utils as utils
2 |
3 |
4 | def get_ngrams(s, n):
5 | # collect every n-length substring (q-gram) of s;
6 | # there are len(s) - n + 1 of them when n <= len(s),
7 | # and none when n > len(s)
8 | all_ngrams = [s[i:i + n] for i in range(len(s) - n + 1)]
9 |
10 | return set(all_ngrams)
11 |
12 |
13 | def qgram_distance(s0, s1, n=2):
14 | """
15 | QGram Distance is the number of q-grams (n-grams) that appear in only one of the 2 strings
16 |
17 | Args:
18 | s0 (str): Sequence 1.
19 | s1 (str): Sequence 2.
20 |
21 | Returns:
22 | int: QGram Distance.
23 |
24 | Examples:
25 | >>> rltk.qgram_distance('abcde','abdcde')
26 | 3
27 | """
28 | if n > max(len(s0), len(s1)):
29 | return 1
30 |
31 | s0_ngrams = get_ngrams(s0, n)
32 | s1_ngrams = get_ngrams(s1, n)
33 | all_ngrams = list(s0_ngrams | s1_ngrams)
34 |
35 | v0 = [1 if all_ngrams[i] in s0 else 0 for i in range(len(all_ngrams))]
36 | v1 = [1 if all_ngrams[i] in s1 else 0 for i in range(len(all_ngrams))]
37 |
38 | return sum([1 if v0[i] != v1[i] else 0 for i in range(len(v0))])
39 |
40 |
41 | def qgram_similarity(s0, s1, n=2):
42 | """
43 | QGram Similarity is the number of common q-grams (n-grams) between 2 strings
44 |
45 | Args:
46 | s0 (str): Sequence 1.
47 | s1 (str): Sequence 2.
48 |
49 | Returns:
50 | int: QGram Similarity.
51 |
52 | Examples:
53 | >>> rltk.qgram_similarity('abcde','abdcde')
54 | 3
55 | """
56 |
57 | if n > max(len(s0), len(s1)):
58 | return 0
59 |
60 | s0_ngrams = get_ngrams(s0, n)
61 | s1_ngrams = get_ngrams(s1, n)
62 | all_ngrams = list(s0_ngrams | s1_ngrams)
63 |
64 | v0 = [1 if all_ngrams[i] in s0 else 0 for i in range(len(all_ngrams))]
65 | v1 = [1 if all_ngrams[i] in s1 else 0 for i in range(len(all_ngrams))]
66 |
67 | return sum([1 if v0[i] == v1[i] else 0 for i in range(len(v0))])
68 |
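69 | # Illustrative usage (a sketch, expanding the docstring values): for n=2,
70 | # 'abcde' yields {ab, bc, cd, de} and 'abdcde' yields {ab, bd, dc, cd, de};
71 | # they share 'ab', 'cd' and 'de' and differ on 'bc', 'bd' and 'dc'.
72 | #
73 | #   >>> qgram_distance('abcde', 'abdcde')
74 | #   3
75 | #   >>> qgram_similarity('abcde', 'abdcde')
76 | #   3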
--------------------------------------------------------------------------------
/rltk/similarity/soundex.py:
--------------------------------------------------------------------------------
1 | import rltk.utils as utils
2 |
3 |
4 | def soundex(s):
5 | """
6 | The standard used for this implementation is provided by the U.S. Census Bureau.
7 |
8 | Args:
9 | s (str): Sequence.
10 |
11 | Returns:
12 | str: Coded sequence.
13 |
14 | Examples:
15 | >>> rltk.soundex('ashcraft')
16 | 'A261'
17 | >>> rltk.soundex('pineapple')
18 | 'P514'
19 | """
20 |
21 | utils.check_for_none(s)
22 | utils.check_for_type(str, s)
23 |
24 | s = utils.unicode_normalize(s)
25 |
26 | if len(s) == 0:
27 | raise ValueError('Empty string')
28 |
29 | s = s.upper()
30 |
31 | CODES = (
32 | ('BFPV', '1'),
33 | ('CGJKQSXZ', '2'),
34 | ('DT', '3'),
35 | ('L', '4'),
36 | ('MN', '5'),
37 | ('R', '6'),
38 | ('AEIOUHWY', '.') # placeholder
39 | )
40 | CODE_DICT = dict((c, replace) for chars, replace in CODES for c in chars)
41 |
42 | sdx = s[0]
43 | for i in range(1, len(s)):
44 | if s[i] not in CODE_DICT:
45 | continue
46 |
47 | code = CODE_DICT[s[i]]
48 | if code == '.':
49 | continue
50 | if s[i] == s[i - 1]: # ignore same letter
51 | continue
52 | if s[i - 1] in CODE_DICT and CODE_DICT[s[i - 1]] == code: # 'side-by-side' rule
53 | continue
54 | if s[i - 1] in ('H', 'W') and i - 2 > 0 and \
55 | s[i - 2] in CODE_DICT and CODE_DICT[s[i - 2]] != '.': # consonant separators
56 | continue
57 |
58 | sdx += code
59 |
60 | sdx = sdx[0:4].ljust(4, '0')
61 |
62 | return sdx
63 |
64 |
65 | def soundex_similarity(s1, s2):
66 | """
67 | soundex(s1) == soundex(s2)
68 |
69 | Args:
70 | s1 (str): Sequence.
71 | s2 (str): Sequence.
72 |
73 | Returns:
74 | int: 1 if soundex(s1) equals soundex(s2), 0 otherwise.
75 | """
76 | return 1 if soundex(s1) == soundex(s2) else 0
77 |
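78 | # Illustrative usage (a sketch; codes derived by tracing the rules above):
79 | # 'ashcroft' also encodes to 'A261' -- its 'C' is dropped by the H/W
80 | # consonant-separator rule -- so the pair scores 1.
81 | #
82 | #   >>> soundex('ashcroft')
83 | #   'A261'
84 | #   >>> soundex_similarity('ashcraft', 'ashcroft')
85 | #   1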
--------------------------------------------------------------------------------
/rltk/similarity/tf_idf.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import math
3 |
4 | import rltk.utils as utils
5 |
6 |
7 | def tf_idf_similarity(bag1, bag2, df_corpus, doc_size, math_log=False):
8 | """
9 | Computes TF/IDF measure. This measure employs the notion of TF/IDF score commonly used in information retrieval (IR) to find documents that are relevant to keyword queries. The intuition underlying the TF/IDF measure is that two strings are similar if they share distinguishing terms.
10 |
11 | Note:
12 | If you need to call this function many times, :meth:`TF_IDF` is more efficient.
13 |
14 | Args:
15 | bag1 (list): Bag 1.
16 | bag2 (list): Bag 2.
17 | df_corpus (dict): The pre calculated document frequency of corpus.
18 | doc_size (int): total documents used in corpus.
19 | math_log (bool, optional): Flag to indicate whether math.log() should be used in TF and IDF formulas. Defaults to False.
20 |
21 | Returns:
22 | float: TF/IDF cosine similarity.
23 |
24 | Examples:
25 | >>> rltk.tf_idf_similarity(['a', 'b', 'a'], ['a', 'c'], {'a':3, 'b':1, 'c':1}, 3)
26 | 0.17541160386140586
27 | >>> rltk.tf_idf_similarity(['a', 'b', 'a'], ['a', 'c'], {'a':3, 'b':2, 'c':1}, 4, True)
28 | 0.12977804138
29 | >>> rltk.tf_idf_similarity(['a', 'b', 'a'], ['a'], {'a':3, 'b':1, 'c':1}, 3)
30 | 0.5547001962252291
31 | """
32 | # http://www.tfidf.com/
33 |
34 | utils.check_for_none(bag1, bag2, df_corpus)
35 | utils.check_for_type(list, bag1, bag2)
36 |
37 | # term frequency for input strings
38 | t_x, t_y = collections.Counter(bag1), collections.Counter(bag2)
39 | tf_x = {k: float(v) / len(bag1) for k, v in t_x.items()}
40 | tf_y = {k: float(v) / len(bag2) for k, v in t_y.items()}
41 |
42 | # unique element
43 | total_unique_elements = set()
44 | total_unique_elements.update(bag1)
45 | total_unique_elements.update(bag2)
46 |
47 | idf_element, v_x, v_y, v_x_y, v_x_2, v_y_2 = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
48 |
49 | # tfidf calculation
50 | for element in total_unique_elements:
51 | if element not in df_corpus:
52 | continue
53 | idf_element = doc_size * 1.0 / df_corpus[element]
54 |
55 | v_x = 0 if element not in tf_x else (math.log(idf_element) * tf_x[element]) if math_log else (
56 | idf_element * tf_x[element])
57 | v_y = 0 if element not in tf_y else (math.log(idf_element) * tf_y[element]) if math_log else (
58 | idf_element * tf_y[element])
59 | v_x_y += v_x * v_y
60 | v_x_2 += v_x * v_x
61 | v_y_2 += v_y * v_y
62 |
63 | # cosine similarity
64 | return 0.0 if v_x_y == 0 else v_x_y / (math.sqrt(v_x_2) * math.sqrt(v_y_2))
65 |
66 |
67 | def compute_tf(tokens):
68 | """
69 | Compute TF (Term Frequency)
70 |
71 | Args:
72 | tokens (list): List of token strings in the document.
73 | """
74 | terms = collections.Counter(tokens)
75 | return {k: float(v) / len(tokens) for k, v in terms.items()}
76 |
77 |
78 | def compute_idf(df_corpus, doc_size, math_log=False):
79 | """
80 | Compute IDF (Inverted Document Frequency)
81 |
82 | Args:
83 | df_corpus (dict): Document frequency of each term in the corpus.
84 | doc_size (int): Total number of documents in the corpus.
85 | math_log (bool, optional): If set, apply math.log() to each IDF value. Defaults to False.
86 | """
87 | return {k: doc_size * 1.0 / v if math_log is False \
88 | else math.log(doc_size * 1.0 / v) \
89 | for k, v in df_corpus.items()}
90 |
91 |
92 | def tf_idf_cosine_similarity(tfidf_dict1, tfidf_dict2):
93 | """
94 | Compute Cosine similarity for TF/IDF value dictionary
95 |
96 | Args:
97 | tfidf_dict1 (dict): TF/IDF dictionary for first record, format in ``{term1: tfidf value, ...}``
98 | tfidf_dict2 (dict): TF/IDF dictionary for second record, same format as tfidf_dict1.
99 |
100 | Returns:
101 | float: Cosine similarity of the two TF/IDF vectors.
102 | """
103 | v_x_y, v_x_2, v_y_2 = 0.0, 0.0, 0.0
104 |
105 | # intersection of dict1 and dict2
106 | # ignore the values that are not in both
107 | for t in tfidf_dict1.keys():
108 | if t in tfidf_dict2:
109 | v_x_y += tfidf_dict1[t] * tfidf_dict2[t]
110 |
111 | for t, tfidf in tfidf_dict1.items():
112 | v_x_2 += tfidf * tfidf
113 | for t, tfidf in tfidf_dict2.items():
114 | v_y_2 += tfidf * tfidf
115 |
116 | # cosine similarity
117 | return 0.0 if v_x_y == 0 else v_x_y / (math.sqrt(v_x_2) * math.sqrt(v_y_2))
118 |
119 |
120 | class TF_IDF():
121 | """
122 | TF/IDF helper class (An efficient implementation)
123 |
124 | Examples::
125 |
126 | # initialization
127 | tfidf = TF_IDF()
128 | # add document
129 | tfidf.add_document('id1', ['a', 'b', 'a'])
130 | tfidf.add_document('id2', ['b', 'c'])
131 | tfidf.add_document('id3', ['b', 'd'])
132 | # compute idf
133 | tfidf.pre_compute()
134 | # get similarity
135 | tfidf.similarity('id1', 'id2')
136 | tfidf.similarity('id1', 'id3')
137 | """
138 |
139 | def __init__(self):
140 | self.tf = {}
141 | self.df_corpus = {}
142 | self.doc_size = 0
143 | self.idf = 0
144 |
145 | def add_document(self, doc_id: str, tokens: list):
146 | """
147 | Add document to corpus
148 |
149 | Args:
150 | doc_id (str): Document (record) id.
151 | tokens (list): List of token string.
152 | """
153 | self.doc_size += 1
154 | tf = compute_tf(tokens)
155 | self.tf[doc_id] = tf
156 | for k, _ in tf.items():
157 | self.df_corpus[k] = self.df_corpus.get(k, 0) + 1
158 |
159 | def pre_compute(self, math_log: bool = False):
160 | """
161 | Pre-compute IDF score
162 |
163 | Args:
164 | math_log (bool, optional): Flag to indicate whether math.log() should be used in TF and IDF formulas. Defaults to False.
165 | """
166 | self.idf = compute_idf(self.df_corpus, self.doc_size, math_log)
167 |
168 | def similarity(self, id1, id2):
169 | """
170 | Get similarity
171 |
172 | Args:
173 | id1 (str): Document id 1.
174 | id2 (str): Document id 2.
175 |
176 | Returns:
177 | float: Cosine similarity of the two documents' TF/IDF vectors.
178 | """
179 | tf_x = self.tf[id1]
180 | tfidf_x = {k: v * self.idf[k] for k, v in tf_x.items()}
181 | tf_y = self.tf[id2]
182 | tfidf_y = {k: v * self.idf[k] for k, v in tf_y.items()}
183 | return tf_idf_cosine_similarity(tfidf_x, tfidf_y)
184 |
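185 | # Illustrative usage of the standalone helpers (a sketch; the corpus
186 | # mirrors the three-document TF_IDF class example above):
187 | #
188 | #   tf1 = compute_tf(['a', 'b', 'a'])    # {'a': 2/3, 'b': 1/3}
189 | #   tf2 = compute_tf(['b', 'c'])         # {'b': 0.5, 'c': 0.5}
190 | #   idf = compute_idf({'a': 1, 'b': 3, 'c': 1, 'd': 1}, 3)
191 | #   tfidf1 = {k: v * idf[k] for k, v in tf1.items()}
192 | #   tfidf2 = {k: v * idf[k] for k, v in tf2.items()}
193 | #   score = tf_idf_cosine_similarity(tfidf1, tfidf2)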
--------------------------------------------------------------------------------
/rltk/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/usc-isi-i2/rltk/aee10ed5dd561583e60db3373ed82fe1208da1e9/rltk/tests/__init__.py
--------------------------------------------------------------------------------
/rltk/tests/test_blocking.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import random
3 |
4 | from rltk.record import Record
5 | from rltk.dataset import Dataset
6 | from rltk.io.reader.array_reader import ArrayReader
7 | from rltk.blocking.block_black_list import BlockBlackList
8 | from rltk.blocking.hash_block_generator import HashBlockGenerator
9 | from rltk.blocking.token_block_generator import TokenBlockGenerator
10 | from rltk.blocking.canopy_block_generator import CanopyBlockGenerator
11 | from rltk.blocking.sorted_neighbourhood_block_generator import SortedNeighbourhoodBlockGenerator
12 |
13 |
14 | class ConcreteRecord(Record):
15 | @property
16 | def id(self):
17 | return self.raw_object['id']
18 |
19 | @property
20 | def name(self):
21 | return self.raw_object['name']
22 |
23 | @property
24 | def category(self):
25 | return self.raw_object['category']
26 |
27 |
28 | raw_data = [
29 | {'id': '1', 'name': 'apple', 'category': 'a'},
30 | {'id': '2', 'name': 'banana', 'category': 'a'},
31 | {'id': '3', 'name': 'apple & banana', 'category': 'b'},
32 | {'id': '4', 'name': 'pineapple', 'category': 'b'},
33 | {'id': '5', 'name': 'peach', 'category': 'b'},
34 | {'id': '6', 'name': 'coconut', 'category': 'b'}
35 | ]
36 |
37 | ds = Dataset(reader=ArrayReader(raw_data), record_class=ConcreteRecord)
38 |
39 |
40 | def test_hash_block_generator():
41 | bg = HashBlockGenerator()
42 | block = bg.block(ds, property_='category')
43 | for key, set_ in block.key_set_adapter:
44 | if key == 'a':
45 | assert set_ == set([(ds.id, '1'), (ds.id, '2')])
46 | elif key == 'b':
47 | assert set_ == set([(ds.id, '3'), (ds.id, '4'), (ds.id, '5'), (ds.id, '6')])
48 | block = bg.block(ds, function_=lambda r: r.category)
49 | for key, set_ in block.key_set_adapter:
50 | if key == 'a':
51 | assert set_ == set([(ds.id, '1'), (ds.id, '2')])
52 | elif key == 'b':
53 | assert set_ == set([(ds.id, '3'), (ds.id, '4'), (ds.id, '5'), (ds.id, '6')])
54 |
55 | block_black_list = BlockBlackList(max_size=2)
56 | block = bg.block(ds, property_='category', block_black_list=block_black_list)
57 | for key, set_ in block.key_set_adapter:
58 | assert key == 'a'
59 | for key, _ in block_black_list.key_set_adapter:
60 | assert key == 'b'
61 |
62 |
63 | def test_token_block_generator():
64 | bg = TokenBlockGenerator()
65 | block = bg.block(ds, function_=lambda r: r.name.split(' '))
66 | for key, set_ in block.key_set_adapter:
67 | if key == 'apple':
68 | assert set_ == set([(ds.id, '1'), (ds.id, '3')])
69 | elif key == 'banana':
70 | assert set_ == set([(ds.id, '2'), (ds.id, '3')])
71 |
72 | block_black_list = BlockBlackList(max_size=1)
73 | block = bg.block(ds, function_=lambda r: r.name.split(' '), block_black_list=block_black_list)
74 | for key, set_ in block.key_set_adapter:
75 | assert len(set_) <= 1
76 | for key, _ in block_black_list.key_set_adapter:
77 | assert key in ('apple', 'banana')
78 |
79 |
80 | def test_canopy_block_generator():
81 | random.seed(0)
82 | bg = CanopyBlockGenerator(t1=5, t2=1, distance_metric=lambda x, y: abs(x[0] - y[0]))
83 | block = bg.block(ds, function_=lambda r: [ord(r.name[0].lower()) - 0x61])
84 | output_block = bg.generate(block, block)
85 | result = [
86 | ['4', '5'],
87 | ['1', '2', '3', '6'],
88 | ['2', '6'],
89 | ['6']
90 | ]
91 | for k, v in output_block.key_set_adapter:
92 | ids = [r[1] for r in v]
93 | assert sorted(ids) == sorted(result[k])
94 |
95 | def test_sorted_neighbourhood_block_generator():
96 | class SNConcreteRecord1(Record):
97 | @property
98 | def id(self):
99 | return self.raw_object['id']
100 |
101 | @property
102 | def char(self):
103 | return self.raw_object['char']
104 |
105 | class SNConcreteRecord2(SNConcreteRecord1):
106 | pass
107 |
108 | sn_raw_data_1 = [
109 | {'id': '11', 'char': 'a'},
110 | {'id': '12', 'char': 'd'},
111 | {'id': '13', 'char': 'c'},
112 | {'id': '14', 'char': 'e'},
113 | ]
114 |
115 | sn_raw_data_2 = [
116 | {'id': '21', 'char': 'b'},
117 | {'id': '22', 'char': 'a'},
118 | {'id': '23', 'char': 'e'},
119 | {'id': '24', 'char': 'f'},
120 | ]
121 |
122 | ds1 = Dataset(reader=ArrayReader(sn_raw_data_1), record_class=SNConcreteRecord1)
123 | ds2 = Dataset(reader=ArrayReader(sn_raw_data_2), record_class=SNConcreteRecord2)
124 |
125 | bg = SortedNeighbourhoodBlockGenerator(window_size=3)
126 | block = bg.generate(
127 | bg.block(ds1, property_='char'),
128 | bg.block(ds2, property_='char')
129 | )
130 |
131 | for block_id, set_ in block.key_set_adapter:
132 | block_data = []
133 | for did, rid in set_:
134 | if did == ds1.id:
135 | block_data.append(ds1.get_record(rid).char)
136 | else:
137 | block_data.append(ds2.get_record(rid).char)
138 | block_data.sort()
139 | for i in range(len(block_data) - 1):
140 | assert block_data[i] <= block_data[i+1]  # chars in a block should be in non-decreasing sorted order
141 |
--------------------------------------------------------------------------------
/rltk/tests/test_io_adapter.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pytest
3 | import redis
4 | import tempfile
5 | import shutil
6 |
7 | from rltk.record import Record
8 | from rltk.io.adapter import *
9 |
10 |
11 | class ConcreteRecord(Record):
12 |
13 | @property
14 | def id(self):
15 | return self.raw_object['id']
16 |
17 | @property
18 | def value(self):
19 | return self.raw_object['value']
20 |
21 |
22 | record = ConcreteRecord(raw_object={'id': 'id1', 'value': 'value1'})
23 |
24 |
25 | def _test_key_value_adapter(adapter):
26 | adapter.set(record.id, record)
27 | assert adapter.get(record.id).id == record.id
28 | assert adapter.get(record.id).value == record.value
29 | for rid, r in adapter:
30 | assert type(rid) == str
31 | assert rid == record.id
32 | assert r.id == record.id
33 | break
34 |
35 | assert adapter.get('no_such_key') is None
36 | adapter.clean()
37 |
38 |
39 | def test_memory_key_value_adapter():
40 | adapter = MemoryKeyValueAdapter()
41 | _test_key_value_adapter(adapter)
42 |
43 |
44 | def test_dbm_key_value_adapter():
45 | name = 'test_dbm_adapter'
46 | adapter = DbmKeyValueAdapter(name)
47 | _test_key_value_adapter(adapter)
48 | if os.path.exists(name + '.db'):
49 | os.remove(name + '.db')
50 |
51 |
52 | def test_redis_key_value_adapter():
53 | try:
54 | adapter = RedisKeyValueAdapter('127.0.0.1', key_prefix='rltk_test_redis_key_value_adapter_')
55 | _test_key_value_adapter(adapter)
56 | except redis.exceptions.ConnectionError:
57 | return
58 |
59 |
60 | def _test_key_set_adapter(adapter):
61 | adapter.set('a', set(['1', '2', '3']))
62 | assert adapter.get('a') == set(['1', '2', '3'])
63 | adapter.add('a', '4')
64 | assert adapter.get('a') == set(['1', '2', '3', '4'])
65 | adapter.remove('a', '4')
66 | assert adapter.get('a') == set(['1', '2', '3'])
67 | assert adapter.get('b') is None
68 | for k, v in adapter:
69 | assert type(k) == str
70 | assert k == 'a'
71 | assert v == set(['1', '2', '3'])
72 | break
73 | adapter.delete('a')
74 | assert adapter.get('a') is None
75 | adapter.set('c', set(['1', '2', '3']))
76 | adapter.clean()
77 | assert adapter.get('c') is None
78 |
79 |
80 | def test_memory_key_set_adapter():
81 | adapter = MemoryKeySetAdapter()
82 | _test_key_set_adapter(adapter)
83 |
84 |
85 | def test_leveldb_key_set_adapter():
86 | path = os.path.join(tempfile.gettempdir(), 'rltk_test_leveldb_key_set_adapter')
87 | adapter = LevelDbKeySetAdapter(path, name='test')
88 | _test_key_set_adapter(adapter)
89 |
90 | shutil.rmtree(path)
91 |
92 |
93 | def test_redis_key_set_adapter():
94 | try:
95 | adapter = RedisKeySetAdapter('127.0.0.1', key_prefix='rltk_test_redis_key_set_adapter_')
96 | _test_key_set_adapter(adapter)
97 | except redis.exceptions.ConnectionError:
98 | return
99 |
--------------------------------------------------------------------------------
/rltk/tests/test_io_reader.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import csv
3 | import json
4 | import io
5 |
6 | from rltk.io.reader import *
7 |
8 |
9 | arr = [{'1': 'A', '2': 'B'}, {'1': 'a', '2': 'b'}]
10 |
11 |
12 | def test_array_reader():
13 | for idx, obj in enumerate(ArrayReader(arr)):
14 | assert obj == arr[idx]
15 |
16 |
17 | def test_dataframe_reader():
18 | df = pd.DataFrame(arr)
19 | for idx, obj in enumerate(DataFrameReader(df)):
20 | assert obj == arr[idx]
21 |
22 |
23 | def test_dataframe_reader_keep_index():
24 | df = pd.DataFrame(arr)
25 | for idx, obj in enumerate(DataFrameReader(df, True)):
26 | assert obj == dict(**arr[idx], dataframe_default_index=idx)
27 |
28 |
29 | def test_csv_reader():
30 | f = io.StringIO()
31 |
32 | writer = csv.DictWriter(f, fieldnames=['1', '2'])
33 | writer.writeheader()
34 | for a in arr:
35 | writer.writerow(a)
36 |
37 | for idx, obj in enumerate(CSVReader(f)):
38 | assert obj == arr[idx]
39 |
40 | f.close()
41 |
42 |
43 | def test_jsonlines_reader():
44 | f = io.StringIO()
45 |
46 | for a in arr:
47 | f.write(json.dumps(a) + '\n')
48 |
49 | for idx, obj in enumerate(JsonLinesReader(f)):
50 | assert obj == arr[idx]
51 |
52 | f.close()
--------------------------------------------------------------------------------
/rltk/tests/test_trial.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from rltk.record import Record
4 | from rltk.evaluation.ground_truth import GroundTruth
5 | from rltk.evaluation.trial import Trial
6 | from rltk.similarity import *
7 |
8 |
9 | class ConcreteRecord(Record):
10 | @property
11 | def id(self):
12 | return self.raw_object['id']
13 |
14 | @property
15 | def data(self):
16 | return self.raw_object['data']
17 |
18 |
19 | @pytest.mark.parametrize('ground_truth_list, cal_result_list, min_c, top_k,tp,tn,fp,fn', [
20 | ([(1, 1, 10, 0, True), (2, 2, 11, 0, True), (3, 1, 12, 1, False), (4, 2, 13, 1, False)],
21 | [(1, 1, 10, 0, True, 0.5), (2, 2, 11, 0, False, 0.5), (3, 1, 12, 1, True, 0.5), (4, 2, 13, 1, False, 0.5)], 0, 0,
22 | 0.5, 0.5, 0.5,
23 | 0.5),
24 | # ([(1, 0, True), (2, 0, True), (1, 1, False), (2, 1, False)],
25 | # [(1, 0, True, 0.6), (2, 0, False, 0.5), (1, 1, True, 0.5), (2, 1, False, 0.6)], 0, 2, 1.0, 1.0, 0, 0)
26 | ])
27 | def test_basic(ground_truth_list, cal_result_list, min_c, top_k, tp, tn, fp, fn):
28 | # if not isinstance(ground_truth_list, (list)) or not isinstance(cal_result_list, (list)):
29 | # with pytest.raises(ValueError):
30 | # # number_equal(n1, n2)
31 | # else:
32 | do_test_trial(ground_truth_list, cal_result_list, min_c, top_k, tp, tn, fp, fn)
33 |
34 |
35 | def do_test_trial(ground_truth_list, cal_result_list, min_c, top_k, tp, tn, fp, fn):
36 | gt = GroundTruth()
37 |
38 | for r1_id, r1_d, r2_id, r2_d, p in ground_truth_list:
39 | raw_object = {'id': r1_id, 'data': r1_d}
40 | r1 = ConcreteRecord(raw_object)
41 | raw_object = {'id': r2_id, 'data': r2_d}
42 | r2 = ConcreteRecord(raw_object)
43 | gt.add_ground_truth(r1_id, r2_id, p)
44 |
45 | trial = Trial(gt, min_c, top_k)
46 | for r1_id, r1_d, r2_id, r2_d, p, c in cal_result_list:
47 | raw_object = {'id': r1_id, 'data': r1_d}
48 | r1 = ConcreteRecord(raw_object)
49 | raw_object = {'id': r2_id, 'data': r2_d}
50 | r2 = ConcreteRecord(raw_object)
51 | trial.add_result(r1, r2, p, c)
52 |
53 | trial.evaluate()
54 |
55 | assert trial.true_positives == tp
56 | assert trial.true_negatives == tn
57 | assert trial.false_positives == fp
58 | assert trial.false_negatives == fn
59 |
60 |
61 | @pytest.mark.parametrize('ground_truth_list, min_c, top_k, similarity_info, tp, tn, fp, fn', [
62 | ([('0', '', '10', 'abc', False), ('1', 'abc', '11', 'abc', True), ('2', 'abcd', '12', 'abc', False),
63 | ('3', 'abd', '13', 'abc', False)],
64 | 0, 0, [('levenshtein_similarity', 0.9), ('string_equal', 0.5)], 1.0, 1.0, 0, 0),
65 | ([('0', '', '10', 'abc', False), ('1', 'abc', '11', 'abc', True), ('2', 'abcd', '12', 'abc', False),
66 | ('3', 'abd', '13', 'abc', False)],
67 | 0, 2, [('levenshtein_similarity', 0.9), ('string_equal', 0.5)], 1.0, 1.0, 0, 0)
68 | ])
69 | def test_lvl(ground_truth_list, min_c, top_k, similarity_info, tp, tn, fp, fn):
70 | gt = GroundTruth()
71 |
72 | for r1_id, r1_d, r2_id, r2_d, p in ground_truth_list:
73 | raw_object = {'id': r1_id, 'data': r1_d}
74 | r1 = ConcreteRecord(raw_object)
75 | raw_object = {'id': r2_id, 'data': r2_d}
76 | r2 = ConcreteRecord(raw_object)
77 | gt.add_ground_truth(r1_id, r2_id, p)
78 |
79 | for similarity_function, min_confidence in similarity_info:
80 | trial = Trial(gt, min_confidence=min_c, top_k=top_k)
81 |
82 | i = 0
83 | for r1_id, r1_d, r2_id, r2_d, c in ground_truth_list:
84 | raw_object = {'id': r1_id, 'data': r1_d}
85 | r1 = ConcreteRecord(raw_object)
86 | raw_object = {'id': r2_id, 'data': r2_d}
87 | r2 = ConcreteRecord(raw_object)
88 |
89 | func_info = similarity_function + '("' + r1_d + '","' + r2_d + '")'
90 | c = eval(func_info)
91 | p = (c >= min_confidence)
92 | trial.add_result(r1, r2, p, c)
93 |
94 | trial.evaluate()
95 |
96 | assert trial.true_positives == tp
97 | assert trial.true_negatives == tn
98 | assert trial.false_positives == fp
99 | assert trial.false_negatives == fn
100 |
--------------------------------------------------------------------------------
/rltk/tokenizer/__init__.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import List
3 | from rltk.tokenizer.crf_tokenizer import crf_tokenizer as dig_tokenizer
4 |
5 |
6 | class Tokenizer(ABC):
7 | """
8 | Abstract tokenizer
9 | """
10 |
11 | @abstractmethod
12 | def tokenize(self, s: str) -> List[str]:
13 | """
14 | Apply tokenizer
15 |
16 | Args:
17 | s (str): String to tokenize.
18 |
19 | Returns:
20 | List[str]: Tokenized list. It won't do token deduplication.
21 | """
22 | raise NotImplementedError
23 |
24 |
25 | class CRFTokenizer(Tokenizer):
26 | """
27 | CRFTokenizer: this uses old DIG CRFTokenizer
28 | """
29 |
30 | def __init__(self, *args, **kwargs) -> None:
31 | self._t = dig_tokenizer.CrfTokenizer(*args, **kwargs)
32 |
33 | def tokenize(self, s: str) -> List[str]:
34 | return self._t.tokenize(s)
35 |
36 |
37 | class WordTokenizer(Tokenizer):
38 | """
39 | Word Tokenizer: tokenize a string by white space
40 |
41 | Args:
42 | remove_empty (bool, optional): If set, empty token will be removed. Defaults to False.
43 | """
44 |
45 | def __init__(self, remove_empty: bool = False) -> None:
46 | self._remove_empty = remove_empty
47 |
48 | def tokenize(self, s: str) -> List[str]:
49 | s = s.split(' ')
50 | if self._remove_empty:
51 | return list(filter(lambda x: len(x) != 0, s))
52 | else:
53 | return s
54 |
55 |
56 | class NGramTokenizer(Tokenizer):
57 | """
58 | NGram Tokenizer
59 |
60 | Args:
61 | n (int): n.
62 | place_holder (str, optional): String to fill pad and separator. Defaults to white space (' ').
63 | padded (bool, optional): If set, head and tail will be filled with pad. Defaults to False.
64 | base_tokenizer (Tokenizer, optional): Tokenizer applied before n-grams are generated. Defaults to WordTokenizer.
65 |
66 | def __init__(self, n: int, place_holder: str = ' ', padded: bool = False,
67 | base_tokenizer: Tokenizer = None) -> None:
68 | self._n = n
69 | self._place_holder = place_holder
70 | self._padded = padded
71 | self._base_tokenizer = base_tokenizer if base_tokenizer else WordTokenizer()
72 |
73 | def tokenize(self, s: str) -> List[str]:
74 | if len(s) == 0:
75 | return []
76 | if self._padded:
77 | pad = self._place_holder * (self._n - 1)
78 | s = pad + s + pad
79 | s = self._base_tokenizer.tokenize(s)
80 | s = self._place_holder.join(s)
81 | if len(s) < self._n:
82 | return [s]
83 | return [s[i:i + self._n] for i in range(len(s) - self._n + 1)]
84 |
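85 | # Illustrative usage (a doctest-style sketch):
86 | #
87 | #   >>> WordTokenizer(remove_empty=True).tokenize('a  b')
88 | #   ['a', 'b']
89 | #   >>> NGramTokenizer(2).tokenize('abc')
90 | #   ['ab', 'bc']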
--------------------------------------------------------------------------------
/rltk/tokenizer/crf_tokenizer/README.md:
--------------------------------------------------------------------------------
1 | # dig-crf-tokenizer
2 |
3 | The tokenization rules take into account embedded HTML tags and
4 | entities. HTML tags begin with "<" and end with ">". The contents of a
5 | tag are treated as a single token, although internal spaces, tabs, and
6 | newlines are stripped out so as not to confuse CRF++. HTML entities
7 | begin with "&" and end with ";", with certain characters allowed
8 | inbetween. They are treated as single tokens.
9 |
10 | HTML tags and HTML entities optionally can be skipped (omitted from the
11 | output array of tokens) after recognition.
12 |
13 | There are risks to the HTML processing rules when the text being
14 | tokenized is not proper HTML. Left angle brackets can cause the
15 | following text to become a single token. Ampersands can merge into
16 | the following textual word.
17 |
18 | A possible solution to the bare ampersand problem is to recognize only
19 | the defined set of HTML entities. It is harder to think of a solution
20 | to the bare left angle bracket problem; perhaps check if they are
21 | followed by the beginning of a valid HTML tag name?
22 |
23 | There is also special provision to group contiguous punctuation characters.
24 | The way to use this tokenizer is to create an instance of it, set any
25 | processing flags you need, then call the tokenize(value) function,
26 | which will return the tokens in an array.
27 |
28 | To tokenize, breaking on punctuation without recognizing HTML tags and
29 | entities, try:
30 | ```
31 | t = CrfTokenizer()
32 | tokens = t.tokenize(value)
33 | ```
34 |
35 | To tokenize, breaking on punctuation and recognizing both HTML tags and
36 | entities as special tokens, try:
37 | ```
38 | t = CrfTokenizer()
39 | t.setRecognizeHtmlEntities(True)
40 | t.setRecognizeHtmlTags(True)
41 | tokens = t.tokenize(value)
42 | ```
43 |
44 | To tokenize, breaking on punctuation, recognizing HTML tags and
45 | entities, and skipping the tags, try:
46 | ```
47 | t = CrfTokenizer()
48 | t.setRecognizeHtmlEntities(True)
49 | t.setRecognizeHtmlTags(True)
50 | t.setSkipHtmlTags(True)
51 | tokens = t.tokenize(value)
52 | ```
53 |
54 | The following sequence will tokenize, strip HTML tags, then join the tokens
55 | into a string. The final result will be the input string with HTML entities
56 | treated as single tokens, HTML tags stripped out, punctuation separated from
57 | adjacent words, and excess white space removed.
58 | ```
59 | t = CrfTokenizer()
60 | t.setRecognizeHtmlEntities(True)
61 | t.setRecognizeHtmlTags(True)
62 | t.setSkipHtmlTags(True)
63 | result = " ".join(t.tokenize(value))
64 | ```
65 |
66 | The same as above, but with punctuation remaining glued to adjacent words:
67 | ```
68 | t = CrfTokenizer()
69 | t.setRecognizePunctuation(False)
70 | t.setRecognizeHtmlTags(True)
71 | t.setSkipHtmlTags(True)
72 | result = " ".join(t.tokenize(value))
73 | ```
74 |
--------------------------------------------------------------------------------
/rltk/tokenizer/crf_tokenizer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/usc-isi-i2/rltk/aee10ed5dd561583e60db3373ed82fe1208da1e9/rltk/tokenizer/crf_tokenizer/__init__.py
--------------------------------------------------------------------------------
/rltk/utils.py:
--------------------------------------------------------------------------------
1 | import unicodedata
2 | import warnings
3 |
4 | from typing import TYPE_CHECKING, List, Union
5 | if TYPE_CHECKING:
6 | from rltk.dataset import Dataset
7 | from rltk.blocking.block import Block
8 | from rltk.evaluation.ground_truth import GroundTruth
9 |
10 |
11 | MAX_FLOAT = float('inf')
12 | MIN_FLOAT = float('-inf')
13 |
14 |
15 | def check_for_none(*args):
16 | for arg in args:
17 | if arg is None:
18 | raise ValueError('Missing parameter')
19 |
20 |
21 | def check_for_type(type, *args):
22 | for arg in args:
23 | if not isinstance(arg, type):
24 | raise TypeError('Wrong type of parameter')
25 |
26 |
27 | def unicode_normalize(s):
28 | return unicodedata.normalize('NFKD', s)
29 |
30 |
31 | def convert_list_to_set(s):
32 | if isinstance(s, list):
33 | s = set(s)
34 | return s
35 |
36 |
37 | def candidate_pairs(dataset1: 'Dataset',
38 | dataset2: 'Dataset' = None,
39 | block: 'Block' = None,
40 | ground_truth: 'GroundTruth' = None):
41 | """
42 | Generate candidate pairs to compare.
43 |
44 | Args:
45 | dataset1 (Dataset): dataset 1.
46 | dataset2 (Dataset, optional): dataset 2. If it's not provided, it will be a de-duplication task.
47 | block (Block, optional): Block.
48 | ground_truth (GroundTruth, optional): Ground truth.
49 | """
50 | if block and not ground_truth:
51 | if not dataset2:
52 | for _, id1, id2 in block.pairwise(dataset1.id):
53 | yield dataset1.get_record(id1), dataset1.get_record(id2)
54 | else:
55 | for _, id1, id2 in block.pairwise(dataset1.id, dataset2.id):
56 | yield dataset1.get_record(id1), dataset2.get_record(id2)
57 | elif ground_truth and not block:
58 | if not dataset2:
59 | for id1, id2, label in ground_truth:
60 | yield dataset1.get_record(id1), dataset1.get_record(id2)
61 | else:
62 | for id1, id2, label in ground_truth:
63 | yield dataset1.get_record(id1), dataset2.get_record(id2)
64 | elif ground_truth and block:
65 | if not dataset2:
66 | for _, id1, id2 in block.pairwise(dataset1.id):
67 | if ground_truth.is_member(id1, id2):
68 | yield dataset1.get_record(id1), dataset1.get_record(id2)
69 | else:
70 | for _, id1, id2 in block.pairwise(dataset1.id, dataset2.id):
71 | if ground_truth.is_member(id1, id2):
72 | yield dataset1.get_record(id1), dataset2.get_record(id2)
73 | else:
74 | if not dataset2:
75 | skip_offset = 0
76 | for r1 in dataset1:
77 | for offset, r2 in enumerate(dataset1):
78 | if offset < skip_offset:
79 | continue
80 | if r1.id == r2.id:
81 | continue
82 | yield r1, r2
83 | skip_offset += 1
84 | else:
85 | for r1 in dataset1:
86 | for r2 in dataset2:
87 | yield r1, r2
88 |
89 |
90 | get_record_pairs = candidate_pairs
91 |
92 |
93 | class ModuleImportWarning(UserWarning):
94 | pass
95 |
96 |
97 | def module_importer(module_names: str, dependencies: Union[str, List[str]], notes: str = None):
98 | if isinstance(dependencies, str):
99 | dependencies = [dependencies]
100 |
101 | def module():
102 | try:
103 | return __import__(module_names)
104 | except ImportError:
105 | warning_msg = '\n-----------------------------------\n'
106 | warning_msg += '\nImport Dependencies Error\n'
107 |
108 | if len(dependencies) > 0:
109 | warning_msg += '\nPlease install dependencies:\n'
110 | for d in dependencies:
111 | warning_msg += d + '\n'
112 |
113 | if notes:
114 | warning_msg += notes
115 |
116 | warning_msg += '\n-----------------------------------'
117 | warnings.warn(warning_msg, ModuleImportWarning)
118 | exit(500)
119 |
120 | return module
121 |
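122 | # Illustrative usage of candidate_pairs (a sketch; `ds1`, `ds2` and `block`
123 | # stand for existing Dataset and Block instances):
124 | #
125 | #   for r1, r2 in candidate_pairs(ds1, ds2, block=block):
126 | #       print(r1.id, r2.id)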
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | from setuptools import find_packages
3 |
4 |
5 | with open('rltk/__init__.py', 'r') as f:
6 | for line in f:
7 | if line.startswith('__version__'):
8 | exec(line) # fetch and create __version__
9 | break
10 |
11 | with open('README.rst', 'r', encoding='utf-8') as f:
12 | long_description = f.read()
13 |
14 | with open('requirements.txt', 'r') as f:
15 | install_requires = list()
16 | dependency_links = list()
17 | for line in f:
18 | req = line.strip()
19 | if req:
20 | if req.startswith('git+') or req.startswith('svn+') or req.startswith('hg+'):
21 | dependency_links.append(req)
22 | else:
23 | install_requires.append(req)
24 |
25 | packages = find_packages()
26 |
27 | setup(
28 | name='rltk',
29 | version=__version__,
30 | packages=packages,
31 | url='https://github.com/usc-isi-i2/rltk',
32 | project_urls={
33 | "Bug Tracker": "https://github.com/usc-isi-i2/rltk/issues",
34 | "Documentation": "https://rltk.readthedocs.io",
35 | "Source Code": "https://github.com/usc-isi-i2/rltk",
36 | },
37 | license='MIT',
38 | author='USC/ISI',
39 | author_email='yixiangy@isi.edu',
40 | description='Record Linkage ToolKit',
41 | long_description=long_description,
42 | long_description_content_type='text/x-rst',
43 | include_package_data=True,
44 | install_requires=install_requires,
45 | dependency_links=dependency_links,
46 | classifiers=(
47 | "Programming Language :: Python :: 3",
48 | "Natural Language :: English",
49 | "License :: OSI Approved :: MIT License",
50 | "Operating System :: OS Independent",
51 | "Topic :: Scientific/Engineering",
52 | "Topic :: Scientific/Engineering :: Information Analysis",
53 | "Topic :: Software Development :: Libraries",
54 | "Topic :: Software Development :: Libraries :: Python Modules",
55 | "Intended Audience :: Science/Research",
56 | "Intended Audience :: Developers",
57 | "Intended Audience :: Education",
58 | "Intended Audience :: Information Technology"
59 | )
60 | )
61 |
--------------------------------------------------------------------------------