├── .github └── workflows │ └── test.yml ├── LICENSE ├── README.md ├── embedb ├── __init__.py ├── embedb.py └── test_embedb.py ├── requirements.txt └── setup.py /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Python package 5 | 6 | on: push 7 | 8 | jobs: 9 | test: 10 | 11 | runs-on: ubuntu-latest 12 | strategy: 13 | fail-fast: true 14 | matrix: 15 | python-version: ["3.7", "3.8", "3.9"] 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | python -m pip install flake8 pytest 27 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 28 | if [ -f requirements-test.txt ]; then pip install -r requirements-test.txt; fi 29 | - name: Lint with flake8 30 | run: | 31 | # stop the build if there are Python syntax errors or undefined names 32 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 33 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 34 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 35 | - name: Test with pytest 36 | run: | 37 | pytest -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Ntropy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # EmbeDB 2 | EmbeDB is a small Python wrapper around [LMDB](https://lmdb.readthedocs.io/) built as key-value storage for embeddings. 3 | 4 | Installation: 5 | `pip install git+https://github.com/ntropy-network/embedb` 6 | 7 | ## Usage 8 | 9 | ```python 10 | from embedb import EmbeDB 11 | import numpy as np 12 | 13 | size = 1000000 14 | vectors = np.random.rand(size, 512).astype('float32') 15 | keys = [str(i) for i in range(size)] 16 | 17 | d = {k: v for k, v in zip(keys, vectors)} 18 | db = EmbeDB("/tmp/test.db", readonly=False) 19 | db.update(d) # faster version of iterations on db[k] = v 20 | 21 | subset = np.random.choice(keys, 10000) 22 | subset_vectors = db[subset] # faster version of [d[k] for k in subset] 23 | ``` 24 | 25 | Basic benchmark: 26 | 27 | ```python 28 | In [2]: !ls -lha /tmp/test.db 29 | -rw-r--r-- 1 Arseny wheel 2.8G Nov 4 18:39 /tmp/test.db 30 | 31 | In [3]: %timeit _ = db[subset] 32 | 21.2 ms ± 598 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) 33 | 34 | In [4]: %timeit _ = [d[k] for k in subset] 35 | 2.57 ms ± 36 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) 36 | ``` 37 | -------------------------------------------------------------------------------- /embedb/__init__.py: -------------------------------------------------------------------------------- 1 | from .embedb import EmbeDB 2 | 3 | __version__ = "0.1.0" 4 | -------------------------------------------------------------------------------- /embedb/embedb.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os.path 3 | from typing import List, Sequence, Union 4 | 5 | import lmdb 6 | import numpy as np 7 | 8 | 9 | def bytes_to_vector(x: bytes) -> np.ndarray: 10 | return np.frombuffer(x, dtype=np.float32) 11 | 12 | 13 | def vector_to_bytes(x: np.ndarray) -> bytes: 14 | return x.tobytes() 15 | 16 | 17 | def ndarray_to_bytes(arr: np.ndarray) -> bytes: 18 | buffer = io.BytesIO() 19 | np.save(buffer, arr) 20 | buffer.seek(0) 21 | return buffer.read() 22 | 23 | 24 | def bytes_to_ndarray(b: bytes) -> np.ndarray: 25 | buffer = io.BytesIO(b) 26 | buffer.seek(0) 27 | return np.load(buffer) 28 | 29 | 30 | class EmbeDB: 31 | """ 32 | A simple file-based key-value store for embeddings. Powered by LMDB. 33 | """ 34 | 35 | def __init__( 36 | self, 37 | path: str, 38 | readonly=True, 39 | size=None, 40 | float_vector_only=True, 41 | encoder=None, 42 | decoder=None, 43 | ): 44 | """ 45 | :param path: Path to the database. 46 | :param readonly: Whether the database is read-only. 47 | :param size: Maximum size of the database in bytes. 48 | :param float_vector_only: When True, only 1D float32 array are allowed (speed optimized). 49 | :param encoder: custom encoder function with a signature (obj => bytes). 50 | :param decoder: custom decoder function with a signature (bytes => obj). 51 | """ 52 | size = size or 10**12 # approx 1 TB 53 | self.path = path 54 | 55 | db_dir = os.path.dirname(path) 56 | os.makedirs(db_dir, exist_ok=True) 57 | 58 | self.env = lmdb.Environment( 59 | path=path, map_size=size, lock=False, readonly=readonly, subdir=False 60 | ) 61 | self.encoder = ( 62 | encoder or vector_to_bytes if float_vector_only else ndarray_to_bytes 63 | ) 64 | self.decoder = ( 65 | decoder or bytes_to_vector if float_vector_only else bytes_to_ndarray 66 | ) 67 | 68 | def __getitem__( 69 | self, item: Union[Sequence[str], str] 70 | ) -> Union[np.ndarray, List[np.ndarray]]: 71 | with self.env.begin(write=False) as txn: 72 | if isinstance(item, str): 73 | res = txn.get(item.encode()) 74 | return self.decoder(res) if res else None 75 | 76 | if isinstance(item, (list, tuple, np.ndarray)): 77 | cur = txn.cursor() 78 | res = [ 79 | self.decoder(v) if v else None 80 | for _, v in cur.getmulti([x.encode() for x in item]) 81 | ] 82 | cur.close() 83 | return res 84 | 85 | raise ValueError(f"Invalid type for item: {type(item)} {item}") 86 | 87 | def __setitem__(self, key: str, value: np.ndarray): 88 | with self.env.begin(write=True) as txn: 89 | txn.put(key.encode(), self.encoder(value)) 90 | 91 | def __contains__(self, item): 92 | return self[item] is not None 93 | 94 | def get(self, key: str) -> np.ndarray: 95 | return self[key] 96 | 97 | def keys(self): 98 | with self.env.begin() as txn: 99 | cur = txn.cursor() 100 | for k, _ in cur: 101 | yield k.decode() 102 | cur.close() 103 | 104 | def values(self): 105 | with self.env.begin() as txn: 106 | cur = txn.cursor() 107 | for _, v in cur: 108 | yield self.decoder(v) 109 | cur.close() 110 | 111 | def items(self): 112 | with self.env.begin() as txn: 113 | cur = txn.cursor() 114 | for k, v in cur: 115 | yield k.decode(), self.decoder(v) 116 | cur.close() 117 | 118 | def update(self, other): 119 | with self.env.begin(write=True) as txn: 120 | cur = txn.cursor() 121 | pairs = [(k.encode(), self.encoder(v)) for k, v in other.items()] 122 | cur.putmulti(pairs) 123 | cur.close() 124 | 125 | def __len__(self): 126 | with self.env.begin() as txn: 127 | length = txn.stat()["entries"] 128 | return length 129 | 130 | def close(self): 131 | self.env.close() 132 | 133 | def __del__(self): 134 | self.close() 135 | 136 | def delete(self, item: Union[Sequence[str], str]): 137 | with self.env.begin(write=True) as txn: 138 | if isinstance(item, str): 139 | return txn.delete(item.encode()) 140 | 141 | if isinstance(item, (list, tuple, np.ndarray)): 142 | return [txn.delete(x.encode()) for x in item] 143 | -------------------------------------------------------------------------------- /embedb/test_embedb.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import tempfile 4 | 5 | import lmdb 6 | import numpy as np 7 | import pytest 8 | 9 | from embedb import EmbeDB 10 | 11 | 12 | @pytest.fixture 13 | def db_path(): 14 | d = tempfile.mkdtemp() 15 | fname = os.path.join(d, "test.rdb") 16 | db = EmbeDB(path=fname, readonly=False) 17 | 18 | db["a"] = np.array([1, 2, 3]).astype(np.float32) 19 | db["b"] = np.array([4, 5, 6]).astype(np.float32) 20 | assert len(db) == 2 21 | db.close() 22 | 23 | yield fname 24 | 25 | shutil.rmtree(d) 26 | 27 | 28 | def test_embedb_getitem_contains(db_path): 29 | db = EmbeDB(db_path) 30 | 31 | assert "a" in db 32 | assert "b" in db 33 | assert db["a"].tolist() == [1, 2, 3] 34 | assert db["b"].tolist() == [4, 5, 6] 35 | db.close() 36 | 37 | 38 | def test_embedb_setitem(db_path): 39 | db = EmbeDB(db_path, readonly=False) 40 | db["c"] = np.array([7, 8, 9]).astype(np.float32) 41 | assert db["c"].tolist() == [7, 8, 9] 42 | db.close() 43 | 44 | 45 | def test_embedb_batch_getitem(db_path): 46 | db = EmbeDB(db_path) 47 | 48 | a, b = db[["a", "b"]] 49 | assert a.tolist() == [1, 2, 3] 50 | assert b.tolist() == [4, 5, 6] 51 | 52 | a, b = db[np.array(["a", "b"])] 53 | assert a.tolist() == [1, 2, 3] 54 | assert b.tolist() == [4, 5, 6] 55 | db.close() 56 | 57 | 58 | def test_embedb_update(db_path): 59 | db = EmbeDB(db_path, readonly=False) 60 | pairs = { 61 | "a": np.array([7, 8, 9]).astype(np.float32), 62 | "b": np.array([10, 11, 12]).astype(np.float32), 63 | } 64 | db.update(pairs) 65 | db.close() 66 | 67 | db = EmbeDB(db_path) 68 | assert db["a"].tolist() == [7, 8, 9] 69 | assert db["b"].tolist() == [10, 11, 12] 70 | 71 | 72 | def test_embedb_iter(db_path): 73 | db = EmbeDB(db_path) 74 | for k, v in db.items(): 75 | assert k in ["a", "b"] 76 | assert v.tolist() in [[1, 2, 3], [4, 5, 6]] 77 | 78 | 79 | def test_embedb_keys(db_path): 80 | db = EmbeDB(db_path) 81 | assert set(db.keys()) == {"a", "b"} 82 | 83 | 84 | def test_embedb_values(db_path): 85 | db = EmbeDB(db_path) 86 | assert [x.tolist() for x in db.values()] == [[1, 2, 3], [4, 5, 6]] 87 | 88 | 89 | def test_multidim_embedb(db_path): 90 | db = EmbeDB(db_path, readonly=False, float_vector_only=False) 91 | db["c"] = np.array([[1, 2], [3, 4]]) 92 | assert db["c"].tolist() == [[1, 2], [3, 4]] 93 | db.close() 94 | 95 | 96 | def test_db_close(db_path): 97 | db = EmbeDB(db_path) 98 | db.close() 99 | with pytest.raises(lmdb.Error): 100 | _ = db["a"] 101 | 102 | 103 | def test_db_contains(db_path): 104 | db = EmbeDB(db_path) 105 | assert "a" in db 106 | assert "b" in db 107 | assert "c" not in db 108 | assert db["c"] is None 109 | assert db.get("c") is None 110 | db.close() 111 | 112 | 113 | def test_db_delete_single(db_path): 114 | db = EmbeDB(db_path, readonly=False) 115 | db.delete("a") 116 | assert "a" not in db 117 | assert "b" in db 118 | db.close() 119 | 120 | 121 | def test_db_delete_multiple(db_path): 122 | db = EmbeDB(db_path, readonly=False) 123 | db.delete(["a", "b"]) 124 | assert "a" not in db 125 | assert "b" not in db 126 | db.close() 127 | 128 | 129 | def test_db_delete_non_existing(db_path): 130 | db = EmbeDB(db_path, readonly=False) 131 | deleted = db.delete(["a", "b", "c"]) 132 | assert deleted == [True, True, False] 133 | db.close() 134 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | lmdb>=1.3.0 2 | numpy>=1.20.0 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from setuptools import find_packages, setup 4 | from embedb import __version__ 5 | 6 | # Package meta-data. 7 | NAME = "embedb" 8 | DESCRIPTION = "EmbeDB is a small Python wrapper around LMDB built as key-value storage for embeddings." 9 | URL = "https://github.com/ntropy-network/embedb" 10 | REQUIRES_PYTHON = ">=3.7.0" 11 | PROJECT_ROOT = os.path.abspath(os.path.dirname(__file__)) 12 | 13 | 14 | def load_requirements(filename): 15 | with open(os.path.join(PROJECT_ROOT, filename), "r") as f: 16 | lineiter = f.read().splitlines() 17 | return [line for line in lineiter if line and not line.startswith("#")] 18 | 19 | 20 | setup( 21 | name=NAME, 22 | version=__version__, 23 | description=DESCRIPTION, 24 | long_description=DESCRIPTION, 25 | long_description_content_type="text/markdown", 26 | keywords=[ 27 | "Machine Learning", 28 | "LMDB", 29 | "Embeddings", 30 | "Databases", 31 | ], 32 | python_requires=REQUIRES_PYTHON, 33 | url=URL, 34 | packages=find_packages(), 35 | install_requires=load_requirements("requirements.txt"), 36 | include_package_data=True, 37 | ) --------------------------------------------------------------------------------