├── .gitignore ├── pub_test.py ├── setup.py ├── README.md ├── LICENSE ├── vdblite └── __init__.py └── test.py /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | dist/ 3 | vdblite.egg-info/ 4 | vdblite/__pycache__/ -------------------------------------------------------------------------------- /pub_test.py: -------------------------------------------------------------------------------- 1 | import vdblite 2 | import pickle 3 | import numpy as np 4 | from time import time 5 | from uuid import uuid4 6 | import sys 7 | from pprint import pprint as pp 8 | 9 | 10 | if __name__ == '__main__': 11 | vdb = vdblite.Vdb() 12 | dimension = 12 # dimensions of each vector 13 | n = 200 # number of vectors 14 | np.random.seed(1) 15 | db_vectors = np.random.random((n, dimension)).astype('float32') 16 | print(db_vectors[0]) 17 | for vector in db_vectors: 18 | info = {'vector': vector, 'time': time(), 'uuid': str(uuid4())} 19 | vdb.add(info) 20 | vdb.details() 21 | results = vdb.search(db_vectors[10]) 22 | pp(results) -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | 7 | """ 8 | cheat sheet reminder for myself because I'm dumb 9 | 10 | python setup.py sdist bdist_wheel 11 | python -m twine upload dist/* 12 | """ 13 | 14 | 15 | setup(name='vdblite', 16 | version='0.1', 17 | description='Vector Database Lite', 18 | url='https://github.com/daveshap/VDBLITE', 19 | long_description=long_description, 20 | long_description_content_type="text/markdown", 21 | author='David Shapiro', 22 | author_email='noone@gmail.com', 23 | license='MIT', 24 | packages=['vdblite'], 25 | install_requires=["numpy", "faiss-cpu"], 26 | zip_safe=False) 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Vector Database Lite (VDBLITE) 2 | 3 | Vector Database Lite (like SQLITE but for vector search) 4 | 5 | 6 | ## Quickstart 7 | 8 | 9 | 1. Install using `pip install vdblite` 10 | 2. Run a test with the following code: 11 | 12 | ```python 13 | import vdblite 14 | import pickle 15 | import numpy as np 16 | from time import time 17 | from uuid import uuid4 18 | import sys 19 | from pprint import pprint as pp 20 | 21 | 22 | if __name__ == '__main__': 23 | vdb = vdblite.Vdb() 24 | dimension = 12 # dimensions of each vector 25 | n = 200 # number of vectors 26 | np.random.seed(1) 27 | db_vectors = np.random.random((n, dimension)).astype('float32') 28 | print(db_vectors[0]) 29 | for vector in db_vectors: 30 | info = {'vector': vector, 'time': time(), 'uuid': str(uuid4())} 31 | vdb.add(info) 32 | vdb.details() 33 | results = vdb.search(db_vectors[10]) 34 | pp(results) 35 | ``` -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 David Shapiro 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /vdblite/__init__.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import numpy as np 3 | from time import time 4 | import faiss 5 | #from uuid import uuid4 6 | import sys 7 | #from pprint import pprint as pp 8 | 9 | 10 | class Vdb(): 11 | def __init__(self): 12 | self.data = list() 13 | self.index = None 14 | 15 | def add(self, payload): # payload is a DICT 16 | self.data.append(payload) # uuid could be in payload :) 17 | 18 | def delete(self, field, value, firstonly=False): 19 | for i in self.data: 20 | try: 21 | if i[field] == value: # if field == 'timestamp' then value might be 1657225709.8192494 22 | self.data.remove(i) 23 | if firstonly: 24 | return 25 | except: 26 | continue 27 | 28 | def initialize_index(self,field='vector'): 29 | vectors = [i['vector'] for i in self.data] 30 | if len(vectors) != 0: 31 | self.index = faiss.IndexFlatL2(len(vectors)) 32 | 33 | def search(self, vector, field='vector', count=5): 34 | 35 | print(self.index.is_trained) # False 36 | self.index.train() # train on the database vectors 37 | print(self.index.ntotal) # 0 38 | self.index.add() # add the vectors and update the index 39 | print(index.is_trained) # True 40 | print(index.ntotal) # 200 41 | 42 | vectors = [i['vector'] for i in self.data] 43 | self.index = faiss.IndexFlatL2(vectors) 44 | 45 | results = list() 46 | for i in self.data: 47 | try: 48 | score = np.dot(i[field], vector) 49 | except Exception as oops: 50 | print(oops) 51 | continue 52 | info = i 53 | info['score'] = score 54 | results.append(info) 55 | ordered = sorted(results, key=lambda d: d['score'], reverse=True) 56 | try: 57 | ordered = ordered[0:count] 58 | return ordered 59 | except: 60 | return ordered 61 | 62 | def purge(self): 63 | self.data = list() 64 | 65 | def save(self, filepath): 66 | with open(filepath, 'wb') as outfile: 67 | pickle.dump(self.data, outfile) 68 | 69 | def load(self, filepath): 70 | with open(filepath, 'wb') as infile: 71 | self.data = pickle.load(infile) 72 | 73 | def details(self): 74 | print('DB elements #:', len(self.data)) 75 | print('DB size in memory:', sys.getsizeof(self.data), 'bytes') 76 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | # in-memory vector similarity search 2 | # objects and/or metadata attached to vectors 3 | 4 | # row 1: "text info here", <>, <> 5 | 6 | # row 2: "AGI memory 1", <>, <>, <>, <> 7 | 8 | 9 | import pickle 10 | import numpy as np 11 | from time import time 12 | from uuid import uuid4 13 | import sys 14 | from pprint import pprint as pp 15 | 16 | 17 | class Vdb(): 18 | def __init__(self): 19 | self.data = list() 20 | 21 | def add(self, payload): # payload is a DICT 22 | self.data.append(payload) # uuid could be in payload :) 23 | 24 | def delete(self, field, value, firstonly=False): 25 | for i in self.data: 26 | try: 27 | if i[field] == value: # if field == 'timestamp' then value might be 1657225709.8192494 28 | self.data.remove(i) 29 | if firstonly: 30 | return 31 | except: 32 | continue 33 | 34 | def search(self, vector, field='vector', count=5): 35 | results = list() 36 | for i in self.data: 37 | try: 38 | score = np.dot(i[field], vector) 39 | except Exception as oops: 40 | print(oops) 41 | continue 42 | info = i 43 | info['score'] = score 44 | results.append(info) 45 | ordered = sorted(results, key=lambda d: d['score'], reverse=True) 46 | try: 47 | ordered = ordered[0:count] 48 | return ordered 49 | except: 50 | return ordered 51 | 52 | def purge(self): 53 | self.data = list() 54 | 55 | def save(self, filepath): 56 | with open(filepath, 'wb') as outfile: 57 | pickle.dump(self.data, outfile) 58 | 59 | def load(self, filepath): 60 | with open(filepath, 'wb') as infile: 61 | self.data = pickle.load(infile) 62 | 63 | def details(self): 64 | print('DB elements #:', len(self.data)) 65 | print('DB size in memory:', sys.getsizeof(self.data), 'bytes') 66 | 67 | 68 | 69 | if __name__ == '__main__': 70 | vdb = Vdb() 71 | dimension = 12 # dimensions of each vector 72 | n = 200 # number of vectors 73 | np.random.seed(1) 74 | db_vectors = np.random.random((n, dimension)).astype('float32') 75 | print(db_vectors[0]) 76 | for vector in db_vectors: 77 | info = {'vector': vector, 'time': time(), 'uuid': str(uuid4())} 78 | vdb.add(info) 79 | vdb.details() 80 | results = vdb.search(db_vectors[10]) 81 | pp(results) --------------------------------------------------------------------------------