├── CHANGES.rst
├── MANIFEST.in
├── hash.npz
├── weights.npz
├── lshashpy3
│   ├── __init__.py
│   ├── storage.py
│   └── lshash.py
├── LICENSE
├── setup.py
├── .gitignore
├── example.py
└── README.rst

--------------------------------------------------------------------------------
/CHANGES.rst:
--------------------------------------------------------------------------------
:Version: 0.0.9
:Python: 3.11.5
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include README.rst
include CHANGES.rst
include LICENSE
--------------------------------------------------------------------------------
/hash.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/loretoparisi/lshash/HEAD/hash.npz
--------------------------------------------------------------------------------
/weights.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/loretoparisi/lshash/HEAD/weights.npz
--------------------------------------------------------------------------------
/lshashpy3/__init__.py:
--------------------------------------------------------------------------------
__version__ = '0.0.9'

from .lshash import *
from .storage import *
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 Loreto Parisi

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

try:
    from setuptools import setup
except ImportError:
    from distutils.core import setup

from os import path
this_directory = path.abspath(path.dirname(__file__))
with open(path.join(this_directory, 'README.rst'), encoding='utf-8') as f:
    readme = f.read()

with open(path.join(this_directory, 'CHANGES.rst'), encoding='utf-8') as f:
    changes = f.read()

required = ['future', 'six', 'numpy', 'bitarray']

setup(
    name='lshashpy3',
    version='0.0.9',
    packages=['lshashpy3'],
    author='Kay Zhu',
    author_email='me@kayzhu.com',
    maintainer='Loreto Parisi',
    maintainer_email='loretoparisi@gmail.com',
    url="https://github.com/loretoparisi/lshash",
    description='A fast Python 3 implementation of locality sensitive hashing with persistence support.',
    long_description=readme + '\n\n' + changes,
    long_description_content_type='text/x-rst',
    license='MIT License',
    install_requires=required,
    classifiers=[
        'Intended Audience :: Developers',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
        'Programming Language :: Python',
        'Programming Language :: Python :: 3',
        'Topic :: Software Development :: Libraries',
    ]
)
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g.
# github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
--------------------------------------------------------------------------------
/lshashpy3/storage.py:
--------------------------------------------------------------------------------
# lshash/storage.py
# Copyright 2012 Kay Zhu (a.k.a He Zhu) and contributors (see CONTRIBUTORS.txt)
#
# This module is part of lshash and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php

import json

try:
    import redis
except ImportError:
    redis = None

__all__ = ['storage']


def storage(storage_config, index):
    """ Given the configuration for storage and the index, return the
    configured storage instance.
    """
    if 'dict' in storage_config:
        return InMemoryStorage(storage_config['dict'])
    elif 'redis' in storage_config:
        storage_config['redis']['db'] = index
        return RedisStorage(storage_config['redis'])
    else:
        raise ValueError("Only in-memory dictionary and Redis are supported.")


class BaseStorage(object):
    def __init__(self, config):
        """ An abstract class used as an adapter for storages. """
        raise NotImplementedError

    def keys(self):
        """ Returns a list of binary hashes that are used as dict keys. """
        raise NotImplementedError

    def set_val(self, key, val):
        """ Set `val` at `key`, note that the `val` must be a string. """
        raise NotImplementedError

    def get_val(self, key):
        """ Return `val` at `key`, note that the `val` must be a string. """
        raise NotImplementedError

    def append_val(self, key, val):
        """ Append `val` to the list stored at `key`.

        If the key is not yet present in storage, create a list with `val` at
        `key`.
        """
        raise NotImplementedError

    def get_list(self, key):
        """ Returns a list stored in storage at `key`.

        This method should return a list of values stored at `key`. `[]` should
        be returned if the list is empty or if `key` is not present in storage.
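
        Note that values may come back serialized depending on the backend:
        the Redis backend below, for instance, returns entries that were
        appended via `append_val` as JSON strings.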
60 | """ 61 | raise NotImplementedError 62 | 63 | 64 | class InMemoryStorage(BaseStorage): 65 | def __init__(self, config): 66 | self.name = 'dict' 67 | self.storage = dict() 68 | 69 | def keys(self): 70 | return self.storage.keys() 71 | 72 | def set_val(self, key, val): 73 | self.storage[key] = val 74 | 75 | def get_val(self, key): 76 | return self.storage[key] 77 | 78 | def append_val(self, key, val): 79 | self.storage.setdefault(key, []).append(val) 80 | 81 | def get_list(self, key): 82 | return self.storage.get(key, []) 83 | 84 | 85 | class RedisStorage(BaseStorage): 86 | def __init__(self, config): 87 | if not redis: 88 | raise ImportError("redis-py is required to use Redis as storage.") 89 | self.name = 'redis' 90 | self.storage = redis.StrictRedis(**config) 91 | 92 | def keys(self, pattern="*"): 93 | return self.storage.keys(pattern) 94 | 95 | def set_val(self, key, val): 96 | self.storage.set(key, val) 97 | 98 | def get_val(self, key): 99 | return self.storage.get(key) 100 | 101 | def append_val(self, key, val): 102 | self.storage.rpush(key, json.dumps(val)) 103 | 104 | def get_list(self, key): 105 | return self.storage.lrange(key, 0, -1) -------------------------------------------------------------------------------- /example.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from lshashpy3 import LSHash 4 | from multiprocessing import cpu_count 5 | 6 | # create 6-bit hashes for input data of 8 dimensions: 7 | k = 6 # hash size 8 | L = 5 # number of tables 9 | d = 8 # Dimension of Feature vector 10 | lsh = LSHash(hash_size=k, input_dim=d, num_hashtables=L) 11 | 12 | # index vector 13 | lsh.index([2,3,4,5,6,7,8,9]) 14 | 15 | # get the binary hash for an input point by iterating through all tables 16 | binary_hashes = lsh.get_hashes([2,3,4,5,6,7,8,9]) 17 | print("hash representation", binary_hashes) 18 | 19 | # checking that each table stores the same input vector with different keys 20 | for key, table in zip(binary_hashes, lsh.hash_tables): 21 | print(key, table.get_list(key)) 22 | 23 | # index vector and extra data 24 | lsh.index([10,12,99,1,5,31,2,3], extra_data="vec1") 25 | lsh.index([10,11,94,1,4,31,2,3], extra_data="vec2") 26 | 27 | # query a data point 28 | top_n = 1 29 | nn = lsh.query([1,2,3,4,5,6,7,7], num_results=top_n, distance_func="euclidean") 30 | print("query (euclidean):", nn) 31 | 32 | # query a data point 33 | top_n = 1 34 | nn = lsh.query([1,2,3,4,5,6,7,7], num_results=top_n, distance_func="hamming") 35 | print("query (hamming):", nn) 36 | 37 | # unpack vector, extra data and vectorial distance 38 | # distance_func can be "euclidean", "true_euclidean", "centred_euclidean", "cosine", "l1norm". 
top_n = 3
nn = lsh.query([10,12,99,1,5,30,1,1], num_results=top_n, distance_func="euclidean")
for ((vec,extra_data),distance) in nn:
    print(vec, extra_data, distance)

# InMemoryStorage
lsh = LSHash(hash_size=k, input_dim=d, num_hashtables=L,
             storage_config={ 'dict': None }, matrices_filename='weights.npz', overwrite=False)

# local storage for numpy uniform random planes, overwrite matrix file
lsh = LSHash(hash_size=k, input_dim=d, num_hashtables=L,
             storage_config={ 'dict': None },
             matrices_filename='weights.npz',
             hashtable_filename='hash.npz',
             overwrite=True)

lsh.index([10,12,99,1,5,31,2,3], extra_data="vec1")
lsh.index([10,11,94,1,4,31,2,3], extra_data="vec2")

top_n = 3
nn = lsh.query([10,12,99,1,5,30,1,1], num_results=top_n, distance_func="euclidean")
print("query (euclidean):", nn)

# save hash table to disk
lsh.save()

# reload the stored random planes and hash table from the local file system
# (overwrite=False, so the saved planes are loaded instead of regenerated)
lsh = LSHash(hash_size=k, input_dim=d, num_hashtables=L,
             storage_config={ 'dict': None },
             matrices_filename='weights.npz',
             hashtable_filename='hash.npz',
             overwrite=False)

# execute a query loading hash table from local file system
top_n = 3
nn = lsh.query([10,12,99,1,5,30,1,1], num_results=top_n, distance_func="euclidean")
print("query from disk (euclidean):", nn)

# Example: Index multiple items using multiprocessing
input_points = [
    [2, 3, 4, 5, 6, 7, 8, 9],
    [10, 12, 99, 1, 5, 31, 2, 3],
    [10, 11, 94, 1, 4, 31, 2, 3],
    [1, 2, 3, 4, 5, 6, 7, 7],
    [10, 12, 99, 1, 5, 30, 1, 1]
]
extra_data_list = ["vec1", "vec2", "vec3", "vec4", "vec5"]

# Indexing in parallel
print(f"Using {cpu_count()} CPU cores for multiprocessing...")
lsh.index_batch(input_points, extra_data_list)

# Verify the indexed data
for point, extra_data in zip(input_points, extra_data_list):
    hashes = lsh.get_hashes(point)
    print(f"Point: {point}, Extra Data: {extra_data}, Hashes: {hashes}")
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
LSHash
======

:Version: 0.0.9
:Python: 3.11.5

A fast Python implementation of locality sensitive hashing with persistence
support.

Based on the original source code: https://github.com/kayzhu/LSHash

Highlights
==========

- Python 3 support.
- Load & save hash tables to local disk.
- Fast hash calculation for large amounts of high-dimensional data through the use of `numpy` arrays.
- Built-in support for persistence through Redis.
- Support for multiple hash indexes.
- Built-in support for common distance/objective functions for ranking outputs.

Installation
============
``LSHash`` depends on the following libraries:

- numpy
- bitarray (if hamming distance is used as the distance function)

Optional:

- redis (if persistence through Redis is needed)

To install from sources:

.. code-block:: bash

    $ git clone https://github.com/loretoparisi/lshash.git
    $ python setup.py install
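
Since ``setup.py install`` is deprecated on recent setuptools releases, installing the checkout with pip is the safer route:

.. code-block:: bash

    $ pip install .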

To install from PyPI:

.. code-block:: bash

    $ pip install lshashpy3
    $ python -c "import lshashpy3 as lshash; print(lshash.__version__);"

Quickstart
==========
To create 6-bit hashes for input data of 8 dimensions:

.. code-block:: python

    from lshashpy3 import LSHash

    # create 6-bit hashes for input data of 8 dimensions:
    lsh = LSHash(6, 8)

    # index vector
    lsh.index([2,3,4,5,6,7,8,9])

    # index vector and extra data
    lsh.index([10,12,99,1,5,31,2,3], extra_data="vec1")
    lsh.index([10,11,94,1,4,31,2,3], extra_data="vec2")

    # query a data point
    top_n = 1
    nn = lsh.query([1,2,3,4,5,6,7,7], num_results=top_n, distance_func="euclidean")
    print(nn)

    # unpack vector, extra data and vectorial distance
    top_n = 3
    nn = lsh.query([10,12,99,1,5,30,1,1], num_results=top_n, distance_func="euclidean")
    for ((vec,extra_data),distance) in nn:
        print(vec, extra_data, distance)


To save the hash table to disk:

.. code-block:: python

    lsh = LSHash(hash_size=k, input_dim=d, num_hashtables=L,
                 storage_config={ 'dict': None },
                 matrices_filename='weights.npz',
                 hashtable_filename='hash.npz',
                 overwrite=True)

    lsh.index([10,12,99,1,5,31,2,3], extra_data="vec1")
    lsh.index([10,11,94,1,4,31,2,3], extra_data="vec2")
    lsh.save()

To load the hash table from disk and perform a query:

.. code-block:: python

    lsh = LSHash(hash_size=k, input_dim=d, num_hashtables=L,
                 storage_config={ 'dict': None },
                 matrices_filename='weights.npz',
                 hashtable_filename='hash.npz',
                 overwrite=False)

    top_n = 3
    nn = lsh.query([10,12,99,1,5,30,1,1], num_results=top_n, distance_func="euclidean")
    print(nn)

New Feature: Multiprocessing Support
====================================

The library now supports indexing multiple items in parallel using the ``index_batch`` method. This feature leverages Python's ``multiprocessing`` module to speed up the indexing process for large datasets.

Example: Using Multiprocessing for Batch Indexing
--------------------------------------------------

To index multiple items in parallel:

.. code-block:: python

    from lshashpy3 import LSHash

    # Create an LSHash instance
    lsh = LSHash(hash_size=6, input_dim=8, num_hashtables=5)

    # Define input points and optional extra data
    input_points = [
        [2, 3, 4, 5, 6, 7, 8, 9],
        [10, 12, 99, 1, 5, 31, 2, 3],
        [10, 11, 94, 1, 4, 31, 2, 3],
        [1, 2, 3, 4, 5, 6, 7, 7],
        [10, 12, 99, 1, 5, 30, 1, 1]
    ]
    extra_data_list = ["vec1", "vec2", "vec3", "vec4", "vec5"]

    # Index the points in parallel
    lsh.index_batch(input_points, extra_data_list)

    # Verify the indexed data
    for point, extra_data in zip(input_points, extra_data_list):
        hashes = lsh.get_hashes(point)
        print(f"Point: {point}, Extra Data: {extra_data}, Hashes: {hashes}")
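
Using Redis as the storage backend
==================================

Hash tables can also be persisted through Redis instead of the default in-memory dictionary. The configuration follows the ``{"redis": {"host": hostname, "port": port_num}}`` form described in the class docstring; a minimal sketch, assuming a Redis server listening on localhost at the default port:

.. code-block:: python

    from lshashpy3 import LSHash

    # each hash table is mapped to its own Redis db index
    lsh = LSHash(hash_size=6, input_dim=8, num_hashtables=5,
                 storage_config={'redis': {'host': 'localhost', 'port': 6379}})

    lsh.index([2,3,4,5,6,7,8,9], extra_data="vec1")
    nn = lsh.query([2,3,4,5,6,7,8,8], num_results=1, distance_func="euclidean")
    print(nn)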

API
==============

- To initialize a ``LSHash`` instance:

.. code-block:: python

    k = 6 # hash size
    L = 5 # number of tables
    d = 8 # dimension of feature vector
    LSHash(hash_size=k, input_dim=d, num_hashtables=L,
           storage_config={ 'dict': None },
           matrices_filename='weights.npz',
           hashtable_filename='hash.npz',
           overwrite=True)

parameters:

``hash_size``:
    The length of the resulting binary hash.
``input_dim``:
    The dimension of the input vector.
``num_hashtables = 1``:
    (optional) The number of hash tables used for multiple lookups.
``storage_config = None``:
    (optional) A dictionary configuring the storage backend used for the
    index: ``{'dict': None}`` for the in-memory dictionary (the default), or
    ``{'redis': {...}}`` for Redis.
``matrices_filename = None``:
    (optional) The path to the .npz file where the random matrices are
    stored, or are to be stored if the file does not exist yet.
``hashtable_filename = None``:
    (optional) The path to the .npz file where the hash tables are stored,
    or are to be stored if the file does not exist yet.
``overwrite = False``:
    (optional) Whether to overwrite the matrices file if it already exists.

- To index a data point of a given ``LSHash`` instance, e.g., ``lsh``:

.. code-block:: python

    lsh.index(input_point, extra_data=None)

parameters:

``input_point``:
    The input data point: an array or tuple of numbers of dimension ``input_dim``.
``extra_data = None``:
    (optional) Extra data to be added along with the input_point.

- To query a data point against a given ``LSHash`` instance, e.g., ``lsh``:

.. code-block:: python

    lsh.query(query_point, num_results=None, distance_func="euclidean")

parameters:

``query_point``:
    The query data point: an array or tuple of numbers of dimension ``input_dim``.
``num_results = None``:
    (optional) The number of query results to return in ranked order. By
    default all results will be returned.
``distance_func = "euclidean"``:
    (optional) Distance function used to rank the candidates. It can be
    "hamming", "euclidean", "true_euclidean", "centred_euclidean", "cosine"
    or "l1norm"; by default "euclidean" is used.
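
When ``bitarray`` is installed, candidates can also be ranked by the hamming distance between binary hashes, as done in example.py:

.. code-block:: python

    nn = lsh.query([1,2,3,4,5,6,7,7], num_results=1, distance_func="hamming")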

- To save the hash table currently indexed:

.. code-block:: python

    lsh.save()
--------------------------------------------------------------------------------
/lshashpy3/lshash.py:
--------------------------------------------------------------------------------
# lshash/lshash.py
# Copyright 2012 Kay Zhu (a.k.a He Zhu) and contributors (see CONTRIBUTORS.txt)
#
# This module is part of lshash and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php
# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals, division, absolute_import
from builtins import int, round, str, object  # noqa

try:
    basestring
except NameError:
    basestring = str

import builtins  # noqa
import six  # noqa

import os
import json
import numpy as np
from multiprocessing import Pool, cpu_count

try:
    from storage import storage  # py2
except ImportError:
    from .storage import storage  # py3

try:
    from bitarray import bitarray
except ImportError:
    bitarray = None

try:
    xrange  # py2
except NameError:
    xrange = range  # py3


class LSHash(object):
    """ LSHash implements locality sensitive hashing using random projection
    for input vectors of dimension `input_dim`.

    Attributes:

    :param hash_size:
        The length of the resulting binary hash, as an integer. E.g., 32 means
        the resulting binary hash will be 32-bit long.
    :param input_dim:
        The dimension of the input vector. E.g., a grey-scale picture of 30x30
        pixels will have an input dimension of 900.
    :param num_hashtables:
        (optional) The number of hash tables used for multiple lookups.
    :param storage_config:
        (optional) A dictionary of the form `{backend_name: config}` where
        `backend_name` is either `dict` or `redis`, and `config` is the
        configuration used by the backend. For `redis` it should be in the
        format of `{"redis": {"host": hostname, "port": port_num}}`, where
        `hostname` is normally `localhost` and `port` is normally 6379.
    :param matrices_filename:
        (optional) Specify the path to the compressed numpy file ending with
        extension `.npz`, where the uniform random planes are stored, or to be
        stored if the file does not exist yet.
    :param hashtable_filename:
        (optional) Specify the path to the compressed numpy file ending with
        extension `.npz`, where the hash tables are stored, or to be stored
        if the file does not exist yet.
    :param overwrite:
        (optional) Whether to overwrite the matrices file if it already exists.
    """

    def __init__(self, hash_size, input_dim, num_hashtables=1,
                 storage_config=None, matrices_filename=None, hashtable_filename=None, overwrite=False):

        self.hash_size = hash_size
        self.input_dim = input_dim
        self.num_hashtables = num_hashtables

        if storage_config is None:
            storage_config = {'dict': None}
        self.storage_config = storage_config

        if matrices_filename and not matrices_filename.endswith('.npz'):
            raise ValueError("The specified file name must end with .npz")
        self.matrices_filename = matrices_filename

        if hashtable_filename and not hashtable_filename.endswith('.npz'):
            raise ValueError("The specified file name must end with .npz")
        self.hashtable_filename = hashtable_filename

        self.overwrite = overwrite

        self._init_uniform_planes()
        self._init_hashtables()

    def _init_uniform_planes(self):
        """ Initialize the uniform planes used to calculate the hashes.

        If `self.matrices_filename` is given, the file exists and
        `self.overwrite` is not selected, load the matrices with `np.load`.

        If `self.matrices_filename` is given and the file is missing or
        `self.overwrite` is selected, generate new planes and save them to
        the specified file.

        If no `self.matrices_filename` is given, only set
        `self.uniform_planes` in memory.
        """

        if "uniform_planes" in self.__dict__:
            return

        if self.matrices_filename:
            file_exist = os.path.isfile(self.matrices_filename)
            if file_exist and not self.overwrite:
                try:
                    npzfiles = np.load(self.matrices_filename)
                except IOError:
                    print("Cannot load specified file as a numpy array")
                    raise
                else:
                    npzfiles = sorted(npzfiles.items(), key=lambda x: x[0])
                    self.uniform_planes = [t[1] for t in npzfiles]
            else:
                self.uniform_planes = [self._generate_uniform_planes()
                                       for _ in xrange(self.num_hashtables)]
                try:
                    np.savez_compressed(self.matrices_filename,
                                        *self.uniform_planes)
                except IOError:
                    print("IOError when saving matrices to specified path")
                    raise
        else:
            self.uniform_planes = [self._generate_uniform_planes()
                                   for _ in xrange(self.num_hashtables)]

    def _init_hashtables(self):
        """ Initialize the hash tables such that each record will be in the
        form of "[storage1, storage2, ...]" """

        if self.hashtable_filename:
            file_exist = os.path.isfile(self.hashtable_filename)
            if file_exist:
                try:
                    npzfiles = np.load(self.hashtable_filename, allow_pickle=True)
                    self.hash_tables = npzfiles['data']
                except IOError:
                    print("Cannot load specified file as a numpy array")
                    raise
            else:
                self.hash_tables = [storage(self.storage_config, i)
                                    for i in xrange(self.num_hashtables)]
        else:
            self.hash_tables = [storage(self.storage_config, i)
                                for i in xrange(self.num_hashtables)]

    def _generate_uniform_planes(self):
        """ Generate uniformly distributed hyperplanes and return them as a 2D
        numpy array.
        """

        return np.random.randn(self.hash_size, self.input_dim)

    def _hash(self, planes, input_point):
        """ Generates the binary hash for `input_point` and returns it.
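
        Each plane contributes one bit: the sign of the dot product between
        the point and the plane decides whether the bit is '1' (positive
        projection) or '0', so e.g. a `hash_size` of 4 yields strings like
        '0110'.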

        :param planes:
            The planes are random uniform planes with a dimension of
            `hash_size` * `input_dim`.
        :param input_point:
            A Python tuple or list object that contains only numbers.
            The dimension needs to be 1 * `input_dim`.
        """

        try:
            input_point = np.array(input_point)  # for faster dot product
            projections = np.dot(planes, input_point)
        except TypeError as e:
            print("""The input point needs to be an array-like object with
                  numbers-only elements""", e)
            raise
        except ValueError as e:
            print("""The input point needs to be of the same dimension as
                  `input_dim` when initializing this LSHash instance""", e)
            raise
        else:
            return "".join(['1' if i > 0 else '0' for i in projections])

    def _as_np_array(self, json_or_tuple):
        """ Takes either a JSON-serialized data structure or a tuple that has
        the original input point stored, and returns the original input point
        as a numpy array.
        """
        if isinstance(json_or_tuple, basestring):
            # JSON-serialized in the case of Redis
            try:
                # Return the point stored as a list, without the extra data
                tuples = json.loads(json_or_tuple)[0]
            except TypeError:
                print("The value stored is not JSON-serializable")
                raise
        else:
            # If extra_data exists, `tuples` is the entire
            # (point:tuple, extra_data). Otherwise (i.e., extra_data=None),
            # return the point stored as a tuple
            tuples = json_or_tuple

        if isinstance(tuples[0], tuple):
            # in this case extra data exists
            return np.asarray(tuples[0])

        elif isinstance(tuples, (tuple, list)):
            try:
                return np.asarray(tuples)
            except ValueError as e:
                print("The input needs to be an array-like object", e)
                raise
        else:
            raise TypeError("query data is not supported")

    def index(self, input_point, extra_data=None):
        """ Index a single input point by adding it to the selected storage.

        If `extra_data` is provided, it will become the value of the dictionary
        {input_point: extra_data}, which in turn will become the value of the
        hash table. `extra_data` needs to be JSON serializable if in-memory
        dict is not used as storage.

        :param input_point:
            A list, or tuple, or numpy ndarray object that contains numbers
            only. The dimension needs to be 1 * `input_dim`.
            This object will be converted to a Python tuple and stored in the
            selected storage.
        :param extra_data:
            (optional) Needs to be a JSON-serializable object: list, dicts and
            basic types such as strings and integers.
        """

        if isinstance(input_point, np.ndarray):
            input_point = input_point.tolist()

        if extra_data is not None:
            value = (tuple(input_point), extra_data)
        else:
            # LP: keep extra_data as None for output consistency
            value = (tuple(input_point), None)

        hashes = []
        for i, table in enumerate(self.hash_tables):
            h = self._hash(self.uniform_planes[i], input_point)
            table.append_val(h, value)
            hashes.append(h)
        return hashes

    def index_batch(self, input_points, extra_data_list=None):
        """
        Index multiple input points, computing their hashes in parallel via
        multiprocessing.

        :param input_points: List of input points to index.
        :param extra_data_list: (optional) List of extra data corresponding to each input point.
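
        Note: the hashes are computed in worker processes, while the hash
        tables themselves are updated in the parent process; with the
        in-memory dict storage, writes made inside a worker would be lost
        when the worker exits.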
257 | """ 258 | if extra_data_list is None: 259 | extra_data_list = [None] * len(input_points) 260 | 261 | if len(input_points) != len(extra_data_list): 262 | raise ValueError("input_points and extra_data_list must have the same length") 263 | 264 | # Prepare data for multiprocessing 265 | data = list(zip(input_points, extra_data_list)) 266 | 267 | # Use multiprocessing Pool to index data in parallel 268 | with Pool(cpu_count()) as pool: 269 | pool.starmap(self.index, data) 270 | 271 | def save(self): 272 | """ 273 | Save save the uniform planes to the specified file. 274 | """ 275 | if self.hashtable_filename: 276 | try: 277 | np.savez_compressed(self.hashtable_filename, allow_pickle=True, data=self.hash_tables) 278 | 279 | except IOError: 280 | print("IOError when saving hash tables to specificed path") 281 | raise 282 | 283 | def query(self, query_point, num_results=None, distance_func=None): 284 | """ Takes `query_point` which is either a tuple or a list of numbers, 285 | returns `num_results` of results as a list of tuples that are ranked 286 | based on the supplied metric function `distance_func`. 287 | 288 | :param query_point: 289 | A list, or tuple, or numpy ndarray that only contains numbers. 290 | The dimension needs to be 1 * `input_dim`. 291 | Used by :meth:`._hash`. 292 | :param num_results: 293 | (optional) Integer, specifies the max amount of results to be 294 | returned. If not specified all candidates will be returned as a 295 | list in ranked order. 296 | :param distance_func: 297 | (optional) The distance function to be used. Currently it needs to 298 | be one of ("hamming", "euclidean", "true_euclidean", 299 | "centred_euclidean", "cosine", "l1norm"). By default "euclidean" 300 | will used. 301 | """ 302 | 303 | candidates = set() 304 | if not distance_func: 305 | distance_func = "euclidean" 306 | 307 | if distance_func == "hamming": 308 | if not bitarray: 309 | raise ImportError(" Bitarray is required for hamming distance") 310 | 311 | for i, table in enumerate(self.hash_tables): 312 | binary_hash = self._hash(self.uniform_planes[i], query_point) 313 | for key in table.keys(): 314 | distance = LSHash.hamming_dist(key, binary_hash) 315 | if distance < 2: 316 | candidates.update(table.get_list(key)) 317 | 318 | d_func = LSHash.euclidean_dist_square 319 | 320 | else: 321 | 322 | if distance_func == "euclidean": 323 | d_func = LSHash.euclidean_dist_square 324 | elif distance_func == "true_euclidean": 325 | d_func = LSHash.euclidean_dist 326 | elif distance_func == "centred_euclidean": 327 | d_func = LSHash.euclidean_dist_centred 328 | elif distance_func == "cosine": 329 | d_func = LSHash.cosine_dist 330 | elif distance_func == "l1norm": 331 | d_func = LSHash.l1norm_dist 332 | else: 333 | raise ValueError("The distance function name is invalid.") 334 | 335 | for i, table in enumerate(self.hash_tables): 336 | binary_hash = self._hash(self.uniform_planes[i], query_point) 337 | candidates.update(table.get_list(binary_hash)) 338 | 339 | # rank candidates by distance function 340 | candidates = [(ix, d_func(query_point, self._as_np_array(ix))) 341 | for ix in candidates] 342 | candidates = sorted(candidates, key=lambda x: x[1]) 343 | 344 | return candidates[:num_results] if num_results else candidates 345 | 346 | def get_hashes(self, input_point): 347 | """ Takes a single input point `input_point`, iterate through the 348 | uniform planes, and returns a list with size of `num_hashtables` 349 | containing the corresponding hash for each hashtable. 

        :param input_point:
            A list, or tuple, or numpy ndarray object that contains numbers
            only. The dimension needs to be 1 * `input_dim`.
        """

        hashes = []
        for planes in self.uniform_planes:
            hashes.append(self._hash(planes, input_point))

        return hashes

    ### distance functions

    @staticmethod
    def hamming_dist(bitarray1, bitarray2):
        # number of differing bits between the two binary hash strings
        xor_result = bitarray(bitarray1) ^ bitarray(bitarray2)
        return xor_result.count()

    @staticmethod
    def euclidean_dist(x, y):
        """ This is a hot function, hence some optimizations are made. """
        diff = np.array(x) - y
        return np.sqrt(np.dot(diff, diff))

    @staticmethod
    def euclidean_dist_square(x, y):
        """ This is a hot function, hence some optimizations are made. """
        diff = np.array(x) - y
        return np.dot(diff, diff)

    @staticmethod
    def euclidean_dist_centred(x, y):
        """ This is a hot function, hence some optimizations are made. """
        diff = np.mean(x) - np.mean(y)
        return np.dot(diff, diff)

    @staticmethod
    def l1norm_dist(x, y):
        return sum(abs(x - y))

    @staticmethod
    def cosine_dist(x, y):
        # 1 - cosine similarity, so that smaller values mean closer vectors
        return 1 - float(np.dot(x, y)) / ((np.dot(x, x) * np.dot(y, y)) ** 0.5)
--------------------------------------------------------------------------------