├── CHANGES.rst
├── MANIFEST.in
├── hash.npz
├── weights.npz
├── lshashpy3
    ├── __init__.py
    ├── storage.py
    └── lshash.py
├── LICENSE
├── setup.py
├── .gitignore
├── example.py
└── README.rst


/CHANGES.rst:
--------------------------------------------------------------------------------
1 | :Version: 0.0.8
2 | :Python: 3.7.7


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.rst
2 | include CHANGES.rst
3 | include LICENSE


--------------------------------------------------------------------------------
/hash.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/loretoparisi/lshash/HEAD/hash.npz


--------------------------------------------------------------------------------
/weights.npz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/loretoparisi/lshash/HEAD/weights.npz


--------------------------------------------------------------------------------
/lshashpy3/__init__.py:
--------------------------------------------------------------------------------
1 | import pkg_resources
2 |  
3 | __version__ = '0.0.9'
4 | 
5 | from .lshash import *
6 | from .storage import *
7 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Loreto Parisi
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | try:
 4 |     from setuptools import setup
 5 | except ImportError:
 6 |     from distutils.core import setup
 7 | 
 8 | from os import path
 9 | this_directory = path.abspath(path.dirname(__file__))
10 | with open(path.join(this_directory, 'README.rst'), encoding='utf-8') as f:
11 |     readme = f.read()
12 | 
13 | with open(path.join(this_directory, 'CHANGES.rst'), encoding='utf-8') as f:
14 |     changes = f.read()
15 | 
16 | required = ['future', 'six', 'numpy', 'bitarray']
17 | 
18 | setup(
19 |     name='lshashpy3',
20 |     version='0.0.9',
21 |     packages=['lshashpy3'],
22 |     author='Kay Zhu',
23 |     author_email='me@kayzhu.com',
24 |     maintainer='Loreto Parisi',
25 |     maintainer_email='loretoparisi@gmail.com',
26 |     url="https://github.com/loretoparisi/lshash",
27 |     description='A fast Python 3 implementation of locality sensitive hashing with persistance support.',
28 |     long_description=readme + '\n\n' + changes,
29 |     long_description_content_type='text/x-rst',
30 |     license='MIT License',
31 |     install_requires=required,
32 |     classifiers=[
33 |         'Intended Audience :: Developers',
34 |         'License :: OSI Approved :: MIT License',
35 |         'Operating System :: OS Independent',
36 |         'Programming Language :: Python',
37 |         'Programming Language :: Python :: 3',
38 |         'Topic :: Software Development :: Libraries',
39 |         ]
40 | )
41 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 


--------------------------------------------------------------------------------
/lshashpy3/storage.py:
--------------------------------------------------------------------------------
  1 | # lshash/storage.py
  2 | # Copyright 2012 Kay Zhu (a.k.a He Zhu) and contributors (see CONTRIBUTORS.txt)
  3 | #
  4 | # This module is part of lshash and is released under
  5 | # the MIT License: http://www.opensource.org/licenses/mit-license.php
  6 |  
  7 | import json
  8 |  
  9 | try:
 10 |     import redis
 11 | except ImportError:
 12 |     redis = None
 13 |  
 14 | __all__ = ['storage']
 15 |  
 16 |  
 17 | def storage(storage_config, index):
 18 |     """ Given the configuration for storage and the index, return the
 19 |     configured storage instance.
 20 |     """
 21 |     if 'dict' in storage_config:
 22 |         return InMemoryStorage(storage_config['dict'])
 23 |     elif 'redis' in storage_config:
 24 |         storage_config['redis']['db'] = index
 25 |         return RedisStorage(storage_config['redis'])
 26 |     else:
 27 |         raise ValueError("Only in-memory dictionary and Redis are supported.")
 28 |  
 29 |  
 30 | class BaseStorage(object):
 31 |     def __init__(self, config):
 32 |         """ An abstract class used as an adapter for storages. """
 33 |         raise NotImplementedError
 34 |  
 35 |     def keys(self):
 36 |         """ Returns a list of binary hashes that are used as dict keys. """
 37 |         raise NotImplementedError
 38 |  
 39 |     def set_val(self, key, val):
 40 |         """ Set `val` at `key`, note that the `val` must be a string. """
 41 |         raise NotImplementedError
 42 |  
 43 |     def get_val(self, key):
 44 |         """ Return `val` at `key`, note that the `val` must be a string. """
 45 |         raise NotImplementedError
 46 |  
 47 |     def append_val(self, key, val):
 48 |         """ Append `val` to the list stored at `key`.
 49 |  
 50 |         If the key is not yet present in storage, create a list with `val` at
 51 |         `key`.
 52 |         """
 53 |         raise NotImplementedError
 54 |  
 55 |     def get_list(self, key):
 56 |         """ Returns a list stored in storage at `key`.
 57 |  
 58 |         This method should return a list of values stored at `key`. `[]` should
 59 |         be returned if the list is empty or if `key` is not present in storage.
 60 |         """
 61 |         raise NotImplementedError
 62 |  
 63 |  
 64 | class InMemoryStorage(BaseStorage):
 65 |     def __init__(self, config):
 66 |         self.name = 'dict'
 67 |         self.storage = dict()
 68 |  
 69 |     def keys(self):
 70 |         return self.storage.keys()
 71 |  
 72 |     def set_val(self, key, val):
 73 |         self.storage[key] = val
 74 |  
 75 |     def get_val(self, key):
 76 |         return self.storage[key]
 77 |  
 78 |     def append_val(self, key, val):
 79 |         self.storage.setdefault(key, []).append(val)
 80 |  
 81 |     def get_list(self, key):
 82 |         return self.storage.get(key, [])
 83 |  
 84 |  
 85 | class RedisStorage(BaseStorage):
 86 |     def __init__(self, config):
 87 |         if not redis:
 88 |             raise ImportError("redis-py is required to use Redis as storage.")
 89 |         self.name = 'redis'
 90 |         self.storage = redis.StrictRedis(**config)
 91 |  
 92 |     def keys(self, pattern="*"):
 93 |         return self.storage.keys(pattern)
 94 |  
 95 |     def set_val(self, key, val):
 96 |         self.storage.set(key, val)
 97 |  
 98 |     def get_val(self, key):
 99 |         return self.storage.get(key)
100 |  
101 |     def append_val(self, key, val):
102 |         self.storage.rpush(key, json.dumps(val))
103 |  
104 |     def get_list(self, key):
105 |         return self.storage.lrange(key, 0, -1)


--------------------------------------------------------------------------------
/example.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | from lshashpy3 import LSHash
 4 | from multiprocessing import cpu_count
 5 | 
 6 | # create 6-bit hashes for input data of 8 dimensions:
 7 | k = 6 # hash size
 8 | L = 5  # number of tables
 9 | d = 8 # Dimension of Feature vector
10 | lsh = LSHash(hash_size=k, input_dim=d, num_hashtables=L)
11 | 
12 | # index vector
13 | lsh.index([2,3,4,5,6,7,8,9])
14 | 
15 | # get the binary hash for an input point by iterating through all tables
16 | binary_hashes = lsh.get_hashes([2,3,4,5,6,7,8,9])
17 | print("hash representation", binary_hashes)
18 | 
19 | # checking that each table stores the same input vector with different keys
20 | for key, table in zip(binary_hashes, lsh.hash_tables):
21 |     print(key, table.get_list(key))
22 | 
23 | # index vector and extra data
24 | lsh.index([10,12,99,1,5,31,2,3], extra_data="vec1")
25 | lsh.index([10,11,94,1,4,31,2,3], extra_data="vec2")
26 | 
27 | # query a data point
28 | top_n = 1
29 | nn = lsh.query([1,2,3,4,5,6,7,7], num_results=top_n, distance_func="euclidean")
30 | print("query (euclidean):", nn)
31 | 
32 | # query a data point
33 | top_n = 1
34 | nn = lsh.query([1,2,3,4,5,6,7,7], num_results=top_n, distance_func="hamming")
35 | print("query (hamming):", nn)
36 | 
37 | # unpack vector, extra data and vectorial distance
38 | # distance_func can be "euclidean", "true_euclidean", "centred_euclidean", "cosine", "l1norm".
39 | top_n = 3
40 | nn = lsh.query([10,12,99,1,5,30,1,1], num_results=top_n, distance_func="euclidean")
41 | for ((vec,extra_data),distance) in nn:
42 |     print(vec, extra_data, distance)
43 | 
44 | # InMemoryStorage
45 | lsh = LSHash(hash_size=k, input_dim=d, num_hashtables=L,
46 |     storage_config={ 'dict': None }, matrices_filename='weights.npz', overwrite=False)
47 | 
48 | # local storage for numpy uniform random planes, overwrite matrix file
49 | lsh = LSHash(hash_size=k, input_dim=d, num_hashtables=L,
50 |     storage_config={ 'dict': None },
51 |     matrices_filename='weights.npz', 
52 |     hashtable_filename='hash.npz', 
53 |     overwrite=True)
54 | 
55 | lsh.index([10,12,99,1,5,31,2,3], extra_data="vec1")
56 | lsh.index([10,11,94,1,4,31,2,3], extra_data="vec2")
57 | 
58 | top_n = 3
59 | nn = lsh.query([10,12,99,1,5,30,1,1], num_results=top_n, distance_func="euclidean")
60 | print("query (euclidean):", nn)
61 | 
62 | # save hash table to disk 
63 | lsh.save()
64 | 
65 | # local storage for numpy uniform random planes, overwrite matrix file
66 | lsh = LSHash(hash_size=k, input_dim=d, num_hashtables=L,
67 |     storage_config={ 'dict': None },
68 |     matrices_filename='weights.npz', 
69 |     hashtable_filename='hash.npz', 
70 |     overwrite=True)
71 | 
72 | # execute a query loading hash table from local file system
73 | top_n = 3
74 | nn = lsh.query([10,12,99,1,5,30,1,1], num_results=top_n, distance_func="euclidean")
75 | print("query from disk (euclidean):", nn)
76 | 
77 | # Example: Index multiple items using multiprocessing
78 | input_points = [
79 |     [2, 3, 4, 5, 6, 7, 8, 9],
80 |     [10, 12, 99, 1, 5, 31, 2, 3],
81 |     [10, 11, 94, 1, 4, 31, 2, 3],
82 |     [1, 2, 3, 4, 5, 6, 7, 7],
83 |     [10, 12, 99, 1, 5, 30, 1, 1]
84 | ]
85 | extra_data_list = ["vec1", "vec2", "vec3", "vec4", "vec5"]
86 | 
87 | # Indexing in parallel
88 | print(f"Using {cpu_count()} CPU cores for multiprocessing...")
89 | lsh.index_batch(input_points, extra_data_list)
90 | 
91 | # Verify the indexed data
92 | for point, extra_data in zip(input_points, extra_data_list):
93 |     hashes = lsh.get_hashes(point)
94 |     print(f"Point: {point}, Extra Data: {extra_data}, Hashes: {hashes}")


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
  1 | LSHash
  2 | ======
  3 | 
  4 | :Version: 0.0.9
  5 | :Python: 3.11.5
  6 | 
  7 | A fast Python implementation of locality sensitive hashing with persistance
  8 | support.
  9 | 
 10 | Based on original source code https://github.com/kayzhu/LSHash
 11 | 
 12 | Highlights
 13 | ==========
 14 | 
 15 | - Python3 support
 16 | - Load & save hash tables to local disk
 17 | - Fast hash calculation for large amount of high dimensional data through the use of `numpy` arrays.
 18 | - Built-in support for persistency through Redis.
 19 | - Multiple hash indexes support.
 20 | - Built-in support for common distance/objective functions for ranking outputs.
 21 | 
 22 | Installation
 23 | ============
 24 | ``LSHash`` depends on the following libraries:
 25 | 
 26 | - numpy
 27 | - bitarray (if hamming distance is used as distance function)
 28 | 
 29 | Optional
 30 | - redis (if persistency through Redis is needed)
 31 | 
 32 | To install from sources:
 33 | 
 34 | .. code-block:: bash
 35 | 
 36 |     $ git clone https://github.com/loretoparisi/lshash.git
 37 |     $ python setup.py install
 38 |     
 39 | To install from PyPI:
 40 | 
 41 | .. code-block:: bash
 42 | 
 43 |     $ pip install lshashpy3
 44 |     $ python -c "import lshashpy3 as lshash; print(lshash.__version__);"
 45 | 
 46 | Quickstart
 47 | ==========
 48 | To create 6-bit hashes for input data of 8 dimensions:
 49 | 
 50 | .. code-block:: python
 51 | 
 52 |  # create 6-bit hashes for input data of 8 dimensions:
 53 |  lsh = LSHash(6, 8)
 54 |  
 55 |  # index vector
 56 |  lsh.index([2,3,4,5,6,7,8,9])
 57 | 
 58 |  # index vector and extra data
 59 |  lsh.index([10,12,99,1,5,31,2,3], extra_data="vec1")
 60 |  lsh.index([10,11,94,1,4,31,2,3], extra_data="vec2")
 61 | 
 62 |  # query a data point
 63 |  top_n = 1
 64 |  nn = lsh.query([1,2,3,4,5,6,7,7], num_results=top_n, distance_func="euclidean")
 65 |  print(nn)
 66 | 
 67 |  # unpack vector, extra data and vectorial distance
 68 |  top_n = 3
 69 |  nn = lsh.query([10,12,99,1,5,30,1,1], num_results=top_n, distance_func="euclidean")
 70 |  for ((vec,extra_data),distance) in nn:
 71 |      print(vec, extra_data, distance)
 72 |         
 73 |         
 74 | To save hash table to disk:
 75 | 
 76 | .. code-block:: python
 77 | 
 78 |  lsh = LSHash(hash_size=k, input_dim=d, num_hashtables=L,
 79 |      storage_config={ 'dict': None },
 80 |      matrices_filename='weights.npz', 
 81 |      hashtable_filename='hash.npz', 
 82 |      overwrite=True)
 83 | 
 84 |  lsh.index([10,12,99,1,5,31,2,3], extra_data="vec1")
 85 |  lsh.index([10,11,94,1,4,31,2,3], extra_data="vec2")
 86 |  lsh.save()
 87 | 
 88 | To load hash table from disk and perform a query:
 89 | 
 90 | .. code-block:: python
 91 | 
 92 |  lsh = LSHash(hash_size=k, input_dim=d, num_hashtables=L,
 93 |      storage_config={ 'dict': None },
 94 |      matrices_filename='weights.npz', 
 95 |      hashtable_filename='hash.npz', 
 96 |      overwrite=False)
 97 | 
 98 |  top_n = 3
 99 |  nn = lsh.query([10,12,99,1,5,30,1,1], num_results=top_n, distance_func="euclidean")
100 |  print(nn)
101 | 
102 | New Feature: Multiprocessing Support
103 | ===================================
104 | 
105 | The library now supports indexing multiple items in parallel using the `index_batch` method. This feature leverages Python's `multiprocessing` module to speed up the indexing process for large datasets.
106 | 
107 | Example: Using Multiprocessing for Batch Indexing
108 | --------------------------------------------------
109 | 
110 | To index multiple items in parallel:
111 | 
112 | .. code-block:: python
113 | 
114 |     from lshashpy3 import LSHash
115 | 
116 |     # Create an LSHash instance
117 |     lsh = LSHash(hash_size=6, input_dim=8, num_hashtables=5)
118 | 
119 |     # Define input points and optional extra data
120 |     input_points = [
121 |         [2, 3, 4, 5, 6, 7, 8, 9],
122 |         [10, 12, 99, 1, 5, 31, 2, 3],
123 |         [10, 11, 94, 1, 4, 31, 2, 3],
124 |         [1, 2, 3, 4, 5, 6, 7, 7],
125 |         [10, 12, 99, 1, 5, 30, 1, 1]
126 |     ]
127 |     extra_data_list = ["vec1", "vec2", "vec3", "vec4", "vec5"]
128 | 
129 |     # Index the points in parallel
130 |     lsh.index_batch(input_points, extra_data_list)
131 | 
132 |     # Verify the indexed data
133 |     for point, extra_data in zip(input_points, extra_data_list):
134 |         hashes = lsh.get_hashes(point)
135 |         print(f"Point: {point}, Extra Data: {extra_data}, Hashes: {hashes}")
136 | 
137 | API
138 | ==============
139 | 
140 | - To initialize a ``LSHash`` instance:
141 | 
142 | .. code-block:: python
143 | 
144 |  k = 6 # hash size
145 |  L = 5  # number of tables
146 |  d = 8 # Dimension of Feature vector
147 |  LSHash(hash_size=k, input_dim=d, num_hashtables=L,
148 |     storage_config={ 'dict': None },
149 |     matrices_filename='weights.npz', 
150 |     hashtable_filename='hash.npz', 
151 |     overwrite=True)
152 | 
153 | parameters:
154 | 
155 | ``hash_size``:
156 |     The length of the resulting binary hash.
157 | ``input_dim``:
158 |     The dimension of the input vector.
159 | ``num_hashtables = 1``:
160 |     (optional) The number of hash tables used for multiple lookups.
161 | ``storage = None``:
162 |     (optional) Specify the name of the storage to be used for the index
163 |     storage. Options include "redis".
164 | ``matrices_filename = None``:
165 |     (optional) Specify the path to the .npz file random matrices are stored
166 |     or to be stored if the file does not exist yet
167 | ``hashtable_filename = None``:
168 |     (optional) Specify the path to the .npz file hash table are stored
169 |     or to be stored if the file does not exist yet
170 | ``overwrite = False``:
171 |     (optional) Whether to overwrite the matrices file if it already exist
172 | 
173 | - To index a data point of a given ``LSHash`` instance, e.g., ``lsh``:
174 | 
175 | .. code-block:: python
176 | 
177 |     lsh.index(input_point, extra_data=None):
178 | 
179 | parameters:
180 | 
181 | ``input_point``:
182 |     The input data point is an array or tuple of numbers of input_dim.
183 | ``extra_data = None``:
184 |     (optional) Extra data to be added along with the input_point.
185 | 
186 | - To query a data point against a given ``LSHash`` instance, e.g., ``lsh``:
187 | 
188 | .. code-block:: python
189 | 
190 |     lsh.query(query_point, num_results=None, distance_func="euclidean"):
191 | 
192 | parameters:
193 | 
194 | ``query_point``:
195 |     The query data point is an array or tuple of numbers of input_dim.
196 | ``num_results = None``:
197 |     (optional) The number of query results to return in ranked order. By
198 |     default all results will be returned.
199 | ``distance_func = "euclidean"``:
200 |     (optional) Distance function to use to rank the candidates. By default
201 |     "euclidean" distance function will be used. Distance function can be 
202 |     "euclidean", "true_euclidean", "centred_euclidean", "cosine", "l1norm".
203 |     
204 | 
205 | - To save the hash table currently indexed:
206 | 
207 | .. code-block:: python
208 | 
209 |     lsh.save():
210 | 


--------------------------------------------------------------------------------
/lshashpy3/lshash.py:
--------------------------------------------------------------------------------
  1 | # lshash/lshash.py
  2 | # Copyright 2012 Kay Zhu (a.k.a He Zhu) and contributors (see CONTRIBUTORS.txt)
  3 | #
  4 | # This module is part of lshash and is released under
  5 | # the MIT License: http://www.opensource.org/licenses/mit-license.php
  6 | # -*- coding: utf-8 -*-
  7 | from __future__ import print_function, unicode_literals, division, absolute_import
  8 | from builtins import int, round, str,  object  # noqa
  9 | 
 10 | try:
 11 |   basestring
 12 | except NameError:
 13 |   basestring = str
 14 | 
 15 | #import future        # noqa
 16 | import builtins      # noqa
 17 | #import past          # noqa
 18 | import six           # noqa
 19 | 
 20 | import os
 21 | import json
 22 | import numpy as np
 23 | from multiprocessing import Pool, cpu_count
 24 | 
 25 | try:
 26 |     from storage import storage  # py2
 27 | except ImportError:
 28 |     from .storage import storage  # py3
 29 | 
 30 | try:
 31 |     from bitarray import bitarray
 32 | except ImportError:
 33 |     bitarray = None
 34 | 
 35 | try:
 36 |     xrange  # py2
 37 | except NameError:
 38 |     xrange = range  # py3
 39 | 
 40 | 
 41 | class LSHash(object):
 42 |     """ LSHash implments locality sensitive hashing using random projection for
 43 |     input vectors of dimension `input_dim`.
 44 | 
 45 |     Attributes:
 46 | 
 47 |     :param hash_size:
 48 |         The length of the resulting binary hash in integer. E.g., 32 means the
 49 |         resulting binary hash will be 32-bit long.
 50 |     :param input_dim:
 51 |         The dimension of the input vector. E.g., a grey-scale picture of 30x30
 52 |         pixels will have an input dimension of 900.
 53 |     :param num_hashtables:
 54 |         (optional) The number of hash tables used for multiple lookups.
 55 |     :param storage_config:
 56 |         (optional) A dictionary of the form `{backend_name: config}` where
 57 |         `backend_name` is the either `dict` or `redis`, and `config` is the
 58 |         configuration used by the backend. For `redis` it should be in the
 59 |         format of `{"redis": {"host": hostname, "port": port_num}}`, where
 60 |         `hostname` is normally `localhost` and `port` is normally 6379.
 61 |     :param matrices_filename:
 62 |         (optional) Specify the path to the compressed numpy file ending with
 63 |         extension `.npz`, where the uniform random planes are stored, or to be
 64 |         stored if the file does not exist yet.
 65 |     :param overwrite:
 66 |         (optional) Whether to overwrite the matrices file if it already exist
 67 |     """
 68 | 
 69 |     def __init__(self, hash_size, input_dim, num_hashtables=1,
 70 |                  storage_config=None, matrices_filename=None, hashtable_filename=None, overwrite=False):
 71 | 
 72 |         self.hash_size = hash_size
 73 |         self.input_dim = input_dim
 74 |         self.num_hashtables = num_hashtables
 75 | 
 76 |         if storage_config is None:
 77 |             storage_config = {'dict': None}
 78 |         self.storage_config = storage_config
 79 | 
 80 |         if matrices_filename and not matrices_filename.endswith('.npz'):
 81 |             raise ValueError("The specified file name must end with .npz")
 82 |         self.matrices_filename = matrices_filename
 83 | 
 84 |         if hashtable_filename and not hashtable_filename.endswith('.npz'):
 85 |             raise ValueError("The specified file name must end with .npz")
 86 |         self.hashtable_filename = hashtable_filename
 87 | 
 88 |         self.overwrite = overwrite
 89 | 
 90 |         self._init_uniform_planes()
 91 |         self._init_hashtables()
 92 | 
 93 |     def _init_uniform_planes(self):
 94 |         """ Initialize uniform planes used to calculate the hashes
 95 | 
 96 |         if file `self.matrices_filename` exist and `self.overwrite` is
 97 |         selected, save the uniform planes to the specified file.
 98 | 
 99 |         if file `self.matrices_filename` exist and `self.overwrite` is not
100 |         selected, load the matrix with `np.load`.
101 | 
102 |         if file `self.matrices_filename` does not exist and regardless of
103 |         `self.overwrite`, only set `self.uniform_planes`.
104 |         """
105 | 
106 |         if "uniform_planes" in self.__dict__:
107 |             return
108 | 
109 |         if self.matrices_filename:
110 |             file_exist = os.path.isfile(self.matrices_filename)
111 |             if file_exist and not self.overwrite:
112 |                 try:
113 |                     npzfiles = np.load(self.matrices_filename)
114 |                 except IOError:
115 |                     print("Cannot load specified file as a numpy array")
116 |                     raise
117 |                 else:
118 |                     npzfiles = sorted(npzfiles.items(), key=lambda x: x[0])
119 |                     self.uniform_planes = [t[1] for t in npzfiles]
120 |             else:
121 |                 self.uniform_planes = [self._generate_uniform_planes()
122 |                                        for _ in xrange(self.num_hashtables)]
123 |                 try:
124 |                     np.savez_compressed(self.matrices_filename,
125 |                                         *self.uniform_planes)
126 |                 except IOError:
127 |                     print("IOError when saving matrices to specificed path")
128 |                     raise
129 |         else:
130 |             self.uniform_planes = [self._generate_uniform_planes()
131 |                                    for _ in xrange(self.num_hashtables)]
132 | 
133 |     def _init_hashtables(self):
134 |         """ Initialize the hash tables such that each record will be in the
135 |         form of "[storage1, storage2, ...]" """
136 | 
137 |         if self.hashtable_filename:
138 |             file_exist = os.path.isfile(self.hashtable_filename)
139 |             if file_exist:
140 |                 try:
141 |                     npzfiles = np.load(self.hashtable_filename, allow_pickle=True)
142 |                     self.hash_tables = npzfiles['data']
143 |                 except IOError:
144 |                     print("Cannot load specified file as a numpy array")
145 |                     raise
146 |             else:
147 |                 self.hash_tables = [storage(self.storage_config, i)
148 |                                 for i in xrange(self.num_hashtables)]
149 |         else:
150 |             self.hash_tables = [storage(self.storage_config, i)
151 |                                 for i in xrange(self.num_hashtables)]
152 | 
153 |     def _generate_uniform_planes(self):
154 |         """ Generate uniformly distributed hyperplanes and return it as a 2D
155 |         numpy array.
156 |         """
157 | 
158 |         return np.random.randn(self.hash_size, self.input_dim)
159 | 
160 |     def _hash(self, planes, input_point):
161 |         """ Generates the binary hash for `input_point` and returns it.
162 | 
163 |         :param planes:
164 |             The planes are random uniform planes with a dimension of
165 |             `hash_size` * `input_dim`.
166 |         :param input_point:
167 |             A Python tuple or list object that contains only numbers.
168 |             The dimension needs to be 1 * `input_dim`.
169 |         """
170 | 
171 |         try:
172 |             input_point = np.array(input_point)  # for faster dot product
173 |             projections = np.dot(planes, input_point)
174 |         except TypeError as e:
175 |             print("""The input point needs to be an array-like object with
176 |                   numbers only elements""")
177 |             raise
178 |         except ValueError as e:
179 |             print("""The input point needs to be of the same dimension as
180 |                   `input_dim` when initializing this LSHash instance""", e)
181 |             raise
182 |         else:
183 |             return "".join(['1' if i > 0 else '0' for i in projections])
184 | 
185 |     def _as_np_array(self, json_or_tuple):
186 |         """ Takes either a JSON-serialized data structure or a tuple that has
187 |         the original input points stored, and returns the original input point
188 |         in numpy array format.
189 |         """
190 |         if isinstance(json_or_tuple, basestring):
191 |             # JSON-serialized in the case of Redis
192 |             try:
193 |                 # Return the point stored as list, without the extra data
194 |                 tuples = json.loads(json_or_tuple)[0]
195 |             except TypeError:
196 |                 print("The value stored is not JSON-serilizable")
197 |                 raise
198 |         else:
199 |             # If extra_data exists, `tuples` is the entire
200 |             # (point:tuple, extra_data). Otherwise (i.e., extra_data=None),
201 |             # return the point stored as a tuple
202 |             tuples = json_or_tuple
203 | 
204 |         if isinstance(tuples[0], tuple):
205 |             # in this case extra data exists
206 |             return np.asarray(tuples[0])
207 | 
208 |         elif isinstance(tuples, (tuple, list)):
209 |             try:
210 |                 return np.asarray(tuples)
211 |             except ValueError as e:
212 |                 print("The input needs to be an array-like object", e)
213 |                 raise
214 |         else:
215 |             raise TypeError("query data is not supported")
216 | 
217 |     def index(self, input_point, extra_data=None):
218 |         """ Index a single input point by adding it to the selected storage.
219 | 
220 |         If `extra_data` is provided, it will become the value of the dictionary
221 |         {input_point: extra_data}, which in turn will become the value of the
222 |         hash table. `extra_data` needs to be JSON serializable if in-memory
223 |         dict is not used as storage.
224 | 
225 |         :param input_point:
226 |             A list, or tuple, or numpy ndarray object that contains numbers
227 |             only. The dimension needs to be 1 * `input_dim`.
228 |             This object will be converted to Python tuple and stored in the
229 |             selected storage.
230 |         :param extra_data:
231 |             (optional) Needs to be a JSON-serializable object: list, dicts and
232 |             basic types such as strings and integers.
233 |         """
234 | 
235 |         if isinstance(input_point, np.ndarray):
236 |             input_point = input_point.tolist()
237 | 
238 |         if extra_data is not None:
239 |             value = (tuple(input_point), extra_data)
240 |         else:
241 |             # LP: extra_data to None to keep output consistency
242 |             value = (tuple(input_point), None)
243 | 
244 |         hashes = []
245 |         for i, table in enumerate(self.hash_tables):
246 |             h = self._hash(self.uniform_planes[i], input_point)
247 |             table.append_val(h, value)
248 |             hashes.append(h)
249 |         return hashes
250 | 
251 |     def index_batch(self, input_points, extra_data_list=None):
252 |         """
253 |         Index multiple input points in parallel using multiprocessing.
254 | 
255 |         :param input_points: List of input points to index.
256 |         :param extra_data_list: (optional) List of extra data corresponding to each input point.
257 |         """
258 |         if extra_data_list is None:
259 |             extra_data_list = [None] * len(input_points)
260 | 
261 |         if len(input_points) != len(extra_data_list):
262 |             raise ValueError("input_points and extra_data_list must have the same length")
263 | 
264 |         # Prepare data for multiprocessing
265 |         data = list(zip(input_points, extra_data_list))
266 | 
267 |         # Use multiprocessing Pool to index data in parallel
268 |         with Pool(cpu_count()) as pool:
269 |             pool.starmap(self.index, data)
270 | 
271 |     def save(self):
272 |         """
273 |             Save save the uniform planes to the specified file.
274 |         """
275 |         if self.hashtable_filename:
276 |             try:
277 |                 np.savez_compressed(self.hashtable_filename, allow_pickle=True, data=self.hash_tables)
278 | 
279 |             except IOError:
280 |                 print("IOError when saving hash tables to specificed path")
281 |                 raise
282 | 
283 |     def query(self, query_point, num_results=None, distance_func=None):
284 |         """ Takes `query_point` which is either a tuple or a list of numbers,
285 |         returns `num_results` of results as a list of tuples that are ranked
286 |         based on the supplied metric function `distance_func`.
287 | 
288 |         :param query_point:
289 |             A list, or tuple, or numpy ndarray that only contains numbers.
290 |             The dimension needs to be 1 * `input_dim`.
291 |             Used by :meth:`._hash`.
292 |         :param num_results:
293 |             (optional) Integer, specifies the max amount of results to be
294 |             returned. If not specified all candidates will be returned as a
295 |             list in ranked order.
296 |         :param distance_func:
297 |             (optional) The distance function to be used. Currently it needs to
298 |             be one of ("hamming", "euclidean", "true_euclidean",
299 |             "centred_euclidean", "cosine", "l1norm"). By default "euclidean"
300 |             will used.
301 |         """
302 | 
303 |         candidates = set()
304 |         if not distance_func:
305 |             distance_func = "euclidean"
306 | 
307 |         if distance_func == "hamming":
308 |             if not bitarray:
309 |                 raise ImportError(" Bitarray is required for hamming distance")
310 | 
311 |             for i, table in enumerate(self.hash_tables):
312 |                 binary_hash = self._hash(self.uniform_planes[i], query_point)
313 |                 for key in table.keys():
314 |                     distance = LSHash.hamming_dist(key, binary_hash)
315 |                     if distance < 2:
316 |                         candidates.update(table.get_list(key))
317 | 
318 |             d_func = LSHash.euclidean_dist_square
319 | 
320 |         else:
321 | 
322 |             if distance_func == "euclidean":
323 |                 d_func = LSHash.euclidean_dist_square
324 |             elif distance_func == "true_euclidean":
325 |                 d_func = LSHash.euclidean_dist
326 |             elif distance_func == "centred_euclidean":
327 |                 d_func = LSHash.euclidean_dist_centred
328 |             elif distance_func == "cosine":
329 |                 d_func = LSHash.cosine_dist
330 |             elif distance_func == "l1norm":
331 |                 d_func = LSHash.l1norm_dist
332 |             else:
333 |                 raise ValueError("The distance function name is invalid.")
334 | 
335 |             for i, table in enumerate(self.hash_tables):
336 |                 binary_hash = self._hash(self.uniform_planes[i], query_point)
337 |                 candidates.update(table.get_list(binary_hash))
338 | 
339 |         # rank candidates by distance function
340 |         candidates = [(ix, d_func(query_point, self._as_np_array(ix)))
341 |                       for ix in candidates]
342 |         candidates = sorted(candidates, key=lambda x: x[1])
343 | 
344 |         return candidates[:num_results] if num_results else candidates
345 | 
346 |     def get_hashes(self, input_point):
347 |         """ Takes a single input point `input_point`, iterate through the
348 |         uniform planes, and returns a list with size of `num_hashtables`
349 |         containing the corresponding hash for each hashtable.
350 | 
351 |         :param input_point:
352 |             A list, or tuple, or numpy ndarray object that contains numbers
353 |             only. The dimension needs to be 1 * `input_dim`.
354 |         """
355 |         
356 |         hashes = []
357 |         for i, table in enumerate(self.hash_tables):
358 |             binary_hash = self._hash(self.uniform_planes[i], input_point)
359 |             hashes.append(binary_hash)
360 |         
361 |         return hashes
362 | 
363 |     ### distance functions
364 | 
365 |     @staticmethod
366 |     def hamming_dist(bitarray1, bitarray2):
367 |         xor_result = bitarray(bitarray1) ^ bitarray(bitarray2)
368 |         return xor_result.count()
369 | 
370 |     @staticmethod
371 |     def euclidean_dist(x, y):
372 |         """ This is a hot function, hence some optimizations are made. """
373 |         diff = np.array(x) - y
374 |         return np.sqrt(np.dot(diff, diff))
375 | 
376 |     @staticmethod
377 |     def euclidean_dist_square(x, y):
378 |         """ This is a hot function, hence some optimizations are made. """
379 |         diff = np.array(x) - y
380 |         return np.dot(diff, diff)
381 | 
382 |     @staticmethod
383 |     def euclidean_dist_centred(x, y):
384 |         """ This is a hot function, hence some optimizations are made. """
385 |         diff = np.mean(x) - np.mean(y)
386 |         return np.dot(diff, diff)
387 | 
388 |     @staticmethod
389 |     def l1norm_dist(x, y):
390 |         return sum(abs(x - y))
391 | 
392 |     @staticmethod
393 |     def cosine_dist(x, y):
394 |         return 1 - float(np.dot(x, y)) / ((np.dot(x, x) * np.dot(y, y)) ** 0.5)
395 | 


--------------------------------------------------------------------------------