├── .github
    └── workflows
    │   └── test.yml
├── LICENSE
├── README.md
├── embedb
    ├── __init__.py
    ├── embedb.py
    └── test_embedb.py
├── requirements.txt
└── setup.py


/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
 3 | 
 4 | name: Python package
 5 | 
 6 | on: push
 7 | 
 8 | jobs:
 9 |   test:
10 | 
11 |     runs-on: ubuntu-latest
12 |     strategy:
13 |       fail-fast: true
14 |       matrix:
15 |         python-version: ["3.7", "3.8", "3.9"]
16 | 
17 |     steps:
18 |     - uses: actions/checkout@v2
19 |     - name: Set up Python ${{ matrix.python-version }}
20 |       uses: actions/setup-python@v2
21 |       with:
22 |         python-version: ${{ matrix.python-version }}
23 |     - name: Install dependencies
24 |       run: |
25 |         python -m pip install --upgrade pip
26 |         python -m pip install flake8 pytest
27 |         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
28 |         if [ -f requirements-test.txt ]; then pip install -r requirements-test.txt; fi
29 |     - name: Lint with flake8
30 |       run: |
31 |         # stop the build if there are Python syntax errors or undefined names
32 |         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
33 |         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
34 |         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
35 |     - name: Test with pytest
36 |       run: |
37 |         pytest


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Ntropy
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # EmbeDB
 2 | EmbeDB is a small Python wrapper around [LMDB](https://lmdb.readthedocs.io/) built as key-value storage for embeddings.
 3 | 
 4 | Installation: 
 5 | `pip install git+https://github.com/ntropy-network/embedb`
 6 | 
 7 | ## Usage
 8 | 
 9 | ```python
10 | from embedb import EmbeDB
11 | import numpy as np
12 | 
13 | size = 1000000
14 | vectors = np.random.rand(size, 512).astype('float32')
15 | keys = [str(i) for i in range(size)]
16 | 
17 | d = {k: v for k, v in zip(keys, vectors)}
18 | db = EmbeDB("/tmp/test.db", readonly=False)
19 | db.update(d)  # faster than assigning db[k] = v one key at a time
20 | 
21 | subset = np.random.choice(keys, 10000)
22 | subset_vectors = db[subset]  # batched (single-transaction) version of [db[k] for k in subset]
23 | ```
24 | 
25 | Basic benchmark:
26 | 
27 | ```python 
28 | In [2]: !ls -lha /tmp/test.db
29 | -rw-r--r--  1 Arseny  wheel   2.8G Nov  4 18:39 /tmp/test.db
30 | 
31 | In [3]: %timeit _ = db[subset]
32 | 21.2 ms ± 598 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
33 | 
34 | In [4]: %timeit _ = [d[k] for k in subset]
35 | 2.57 ms ± 36 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
36 | ```
37 | 


--------------------------------------------------------------------------------
/embedb/__init__.py:
--------------------------------------------------------------------------------
1 | from .embedb import EmbeDB
2 | 
3 | __version__ = "0.1.0"
4 | 


--------------------------------------------------------------------------------
/embedb/embedb.py:
--------------------------------------------------------------------------------
  1 | import io
  2 | import os.path
  3 | from typing import List, Sequence, Union
  4 | 
  5 | import lmdb
  6 | import numpy as np
  7 | 
  8 | 
  9 | def bytes_to_vector(x: bytes) -> np.ndarray:
 10 |     return np.frombuffer(x, dtype=np.float32)
 11 | 
 12 | 
 13 | def vector_to_bytes(x: np.ndarray) -> bytes:
 14 |     return x.tobytes()
 15 | 
 16 | 
 17 | def ndarray_to_bytes(arr: np.ndarray) -> bytes:
 18 |     buffer = io.BytesIO()
 19 |     np.save(buffer, arr)
 20 |     buffer.seek(0)
 21 |     return buffer.read()
 22 | 
 23 | 
 24 | def bytes_to_ndarray(b: bytes) -> np.ndarray:
 25 |     buffer = io.BytesIO(b)
 26 |     buffer.seek(0)
 27 |     return np.load(buffer)
 28 | 
 29 | 
 30 | class EmbeDB:
 31 |     """
 32 |     A simple file-based key-value store for embeddings. Powered by LMDB.
 33 |     """
 34 | 
 35 |     def __init__(
 36 |         self,
 37 |         path: str,
 38 |         readonly=True,
 39 |         size=None,
 40 |         float_vector_only=True,
 41 |         encoder=None,
 42 |         decoder=None,
 43 |     ):
 44 |         """
 45 |         :param path: Path to the database.
 46 |         :param readonly: Whether the database is read-only.
 47 |         :param size: Maximum size of the database in bytes.
 48 |         :param float_vector_only: When True, only 1D float32 array are allowed (speed optimized).
 49 |         :param encoder: custom encoder function with a signature (obj => bytes).
 50 |         :param decoder: custom decoder function with a signature (bytes => obj).
 51 |         """
 52 |         size = size or 10**12  # approx 1 TB
 53 |         self.path = path
 54 | 
 55 |         db_dir = os.path.dirname(path)
 56 |         os.makedirs(db_dir, exist_ok=True)
 57 | 
 58 |         self.env = lmdb.Environment(
 59 |             path=path, map_size=size, lock=False, readonly=readonly, subdir=False
 60 |         )
 61 |         self.encoder = (
 62 |             encoder or vector_to_bytes if float_vector_only else ndarray_to_bytes
 63 |         )
 64 |         self.decoder = (
 65 |             decoder or bytes_to_vector if float_vector_only else bytes_to_ndarray
 66 |         )
 67 | 
 68 |     def __getitem__(
 69 |         self, item: Union[Sequence[str], str]
 70 |     ) -> Union[np.ndarray, List[np.ndarray]]:
 71 |         with self.env.begin(write=False) as txn:
 72 |             if isinstance(item, str):
 73 |                 res = txn.get(item.encode())
 74 |                 return self.decoder(res) if res else None
 75 | 
 76 |             if isinstance(item, (list, tuple, np.ndarray)):
 77 |                 cur = txn.cursor()
 78 |                 res = [
 79 |                     self.decoder(v) if v else None
 80 |                     for _, v in cur.getmulti([x.encode() for x in item])
 81 |                 ]
 82 |                 cur.close()
 83 |                 return res
 84 | 
 85 |         raise ValueError(f"Invalid type for item: {type(item)} {item}")
 86 | 
 87 |     def __setitem__(self, key: str, value: np.ndarray):
 88 |         with self.env.begin(write=True) as txn:
 89 |             txn.put(key.encode(), self.encoder(value))
 90 | 
 91 |     def __contains__(self, item):
 92 |         return self[item] is not None
 93 | 
 94 |     def get(self, key: str) -> np.ndarray:
 95 |         return self[key]
 96 | 
 97 |     def keys(self):
 98 |         with self.env.begin() as txn:
 99 |             cur = txn.cursor()
100 |             for k, _ in cur:
101 |                 yield k.decode()
102 |             cur.close()
103 | 
104 |     def values(self):
105 |         with self.env.begin() as txn:
106 |             cur = txn.cursor()
107 |             for _, v in cur:
108 |                 yield self.decoder(v)
109 |             cur.close()
110 | 
111 |     def items(self):
112 |         with self.env.begin() as txn:
113 |             cur = txn.cursor()
114 |             for k, v in cur:
115 |                 yield k.decode(), self.decoder(v)
116 |             cur.close()
117 | 
118 |     def update(self, other):
119 |         with self.env.begin(write=True) as txn:
120 |             cur = txn.cursor()
121 |             pairs = [(k.encode(), self.encoder(v)) for k, v in other.items()]
122 |             cur.putmulti(pairs)
123 |             cur.close()
124 | 
125 |     def __len__(self):
126 |         with self.env.begin() as txn:
127 |             length = txn.stat()["entries"]
128 |         return length
129 | 
130 |     def close(self):
131 |         self.env.close()
132 | 
133 |     def __del__(self):
134 |         self.close()
135 | 
136 |     def delete(self, item: Union[Sequence[str], str]):
137 |         with self.env.begin(write=True) as txn:
138 |             if isinstance(item, str):
139 |                 return txn.delete(item.encode())
140 | 
141 |             if isinstance(item, (list, tuple, np.ndarray)):
142 |                 return [txn.delete(x.encode()) for x in item]
143 | 


--------------------------------------------------------------------------------
/embedb/test_embedb.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import shutil
  3 | import tempfile
  4 | 
  5 | import lmdb
  6 | import numpy as np
  7 | import pytest
  8 | 
  9 | from embedb import EmbeDB
 10 | 
 11 | 
 12 | @pytest.fixture
 13 | def db_path():
 14 |     d = tempfile.mkdtemp()
 15 |     fname = os.path.join(d, "test.rdb")
 16 |     db = EmbeDB(path=fname, readonly=False)
 17 | 
 18 |     db["a"] = np.array([1, 2, 3]).astype(np.float32)
 19 |     db["b"] = np.array([4, 5, 6]).astype(np.float32)
 20 |     assert len(db) == 2
 21 |     db.close()
 22 | 
 23 |     yield fname
 24 | 
 25 |     shutil.rmtree(d)
 26 | 
 27 | 
 28 | def test_embedb_getitem_contains(db_path):
 29 |     db = EmbeDB(db_path)
 30 | 
 31 |     assert "a" in db
 32 |     assert "b" in db
 33 |     assert db["a"].tolist() == [1, 2, 3]
 34 |     assert db["b"].tolist() == [4, 5, 6]
 35 |     db.close()
 36 | 
 37 | 
 38 | def test_embedb_setitem(db_path):
 39 |     db = EmbeDB(db_path, readonly=False)
 40 |     db["c"] = np.array([7, 8, 9]).astype(np.float32)
 41 |     assert db["c"].tolist() == [7, 8, 9]
 42 |     db.close()
 43 | 
 44 | 
 45 | def test_embedb_batch_getitem(db_path):
 46 |     db = EmbeDB(db_path)
 47 | 
 48 |     a, b = db[["a", "b"]]
 49 |     assert a.tolist() == [1, 2, 3]
 50 |     assert b.tolist() == [4, 5, 6]
 51 | 
 52 |     a, b = db[np.array(["a", "b"])]
 53 |     assert a.tolist() == [1, 2, 3]
 54 |     assert b.tolist() == [4, 5, 6]
 55 |     db.close()
 56 | 
 57 | 
 58 | def test_embedb_update(db_path):
 59 |     db = EmbeDB(db_path, readonly=False)
 60 |     pairs = {
 61 |         "a": np.array([7, 8, 9]).astype(np.float32),
 62 |         "b": np.array([10, 11, 12]).astype(np.float32),
 63 |     }
 64 |     db.update(pairs)
 65 |     db.close()
 66 | 
 67 |     db = EmbeDB(db_path)
 68 |     assert db["a"].tolist() == [7, 8, 9]
 69 |     assert db["b"].tolist() == [10, 11, 12]
 70 | 
 71 | 
 72 | def test_embedb_iter(db_path):
 73 |     db = EmbeDB(db_path)
 74 |     for k, v in db.items():
 75 |         assert k in ["a", "b"]
 76 |         assert v.tolist() in [[1, 2, 3], [4, 5, 6]]
 77 | 
 78 | 
 79 | def test_embedb_keys(db_path):
 80 |     db = EmbeDB(db_path)
 81 |     assert set(db.keys()) == {"a", "b"}
 82 | 
 83 | 
 84 | def test_embedb_values(db_path):
 85 |     db = EmbeDB(db_path)
 86 |     assert [x.tolist() for x in db.values()] == [[1, 2, 3], [4, 5, 6]]
 87 | 
 88 | 
 89 | def test_multidim_embedb(db_path):
 90 |     db = EmbeDB(db_path, readonly=False, float_vector_only=False)
 91 |     db["c"] = np.array([[1, 2], [3, 4]])
 92 |     assert db["c"].tolist() == [[1, 2], [3, 4]]
 93 |     db.close()
 94 | 
 95 | 
 96 | def test_db_close(db_path):
 97 |     db = EmbeDB(db_path)
 98 |     db.close()
 99 |     with pytest.raises(lmdb.Error):
100 |         _ = db["a"]
101 | 
102 | 
def test_db_contains(db_path):
    """Missing keys are not ``in`` the db and read back as ``None``."""
    store = EmbeDB(db_path)
    for present in ("a", "b"):
        assert present in store

    missing = "c"
    assert missing not in store
    assert store[missing] is None
    assert store.get(missing) is None
    store.close()
111 | 
112 | 
def test_db_delete_single(db_path):
    """Deleting one key removes it and leaves the other key in place."""
    store = EmbeDB(db_path, readonly=False)
    removed, kept = "a", "b"
    store.delete(removed)
    assert removed not in store
    assert kept in store
    store.close()
119 | 
120 | 
def test_db_delete_multiple(db_path):
    """Deleting a list of keys removes each of them."""
    store = EmbeDB(db_path, readonly=False)
    doomed = ["a", "b"]
    store.delete(doomed)
    for key in doomed:
        assert key not in store
    store.close()
127 | 
128 | 
def test_db_delete_non_existing(db_path):
    """Batch delete reports, per key, whether that key was present."""
    store = EmbeDB(db_path, readonly=False)
    requested = ["a", "b", "c"]
    outcome = store.delete(requested)
    assert outcome == [True, True, False]
    store.close()
134 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | lmdb>=1.3.0
2 | numpy>=1.20.0
3 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from setuptools import find_packages, setup
 4 | from embedb import __version__
 5 | 
 6 | # Package meta-data.
 7 | NAME = "embedb"
 8 | DESCRIPTION = "EmbeDB is a small Python wrapper around LMDB built as key-value storage for embeddings."
 9 | URL = "https://github.com/ntropy-network/embedb"
10 | REQUIRES_PYTHON = ">=3.7.0"
11 | PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__))
12 | 
13 | 
14 | def load_requirements(filename):
15 |     with open(os.path.join(PROJECT_ROOT, filename), "r") as f:
16 |         lineiter = f.read().splitlines()
17 |     return [line for line in lineiter if line and not line.startswith("#")]
18 | 
19 | 
20 | setup(
21 |     name=NAME,
22 |     version=__version__,
23 |     description=DESCRIPTION,
24 |     long_description=DESCRIPTION,
25 |     long_description_content_type="text/markdown",
26 |     keywords=[
27 |         "Machine Learning",
28 |         "LMDB",
29 |         "Embeddings",
30 |         "Databases",
31 |     ],
32 |     python_requires=REQUIRES_PYTHON,
33 |     url=URL,
34 |     packages=find_packages(),
35 |     install_requires=load_requirements("requirements.txt"),
36 |     include_package_data=True,
37 | )


--------------------------------------------------------------------------------