├── .gitignore
├── pub_test.py
├── setup.py
├── README.md
├── LICENSE
├── vdblite
    └── __init__.py
└── test.py


/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 | dist/
3 | vdblite.egg-info/
4 | vdblite/__pycache__/


--------------------------------------------------------------------------------
/pub_test.py:
--------------------------------------------------------------------------------
 1 | import vdblite
 2 | import pickle
 3 | import numpy as np
 4 | from time import time
 5 | from uuid import uuid4
 6 | import sys
 7 | from pprint import pprint as pp
 8 | 
 9 | 
if __name__ == '__main__':
    # Demo: fill a Vdb with random float32 vectors, print its stats, then
    # search using one of the stored vectors as the query.
    np.random.seed(1)  # fixed seed so every run generates the same vectors
    num_vectors, num_dims = 200, 12
    db_vectors = np.random.random((num_vectors, num_dims)).astype('float32')
    print(db_vectors[0])

    vdb = vdblite.Vdb()
    for row in db_vectors:
        # each payload carries the vector plus a timestamp and a unique id
        vdb.add({'vector': row, 'time': time(), 'uuid': str(uuid4())})
    vdb.details()

    pp(vdb.search(db_vectors[10]))


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | with open("README.md", "r") as fh:
 4 |     long_description = fh.read()
 5 | 
 6 | 
 7 | """
 8 | cheat sheet reminder for myself because I'm dumb
 9 | 
10 | python setup.py sdist bdist_wheel
11 | python -m twine upload dist/*
12 | """
13 | 
14 | 
15 | setup(name='vdblite',
16 |       version='0.1',
17 |       description='Vector Database Lite',
18 |       url='https://github.com/daveshap/VDBLITE',
19 |       long_description=long_description,
20 |       long_description_content_type="text/markdown",
21 |       author='David Shapiro',
22 |       author_email='noone@gmail.com',
23 |       license='MIT',
24 |       packages=['vdblite'],
25 |       install_requires=["numpy", "faiss-cpu"],
26 |       zip_safe=False)
27 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Vector Database Lite (VDBLITE)
 2 | 
 3 | Vector Database Lite (like SQLITE but for vector search)
 4 | 
 5 | 
 6 | ## Quickstart
 7 | 
 8 | 
 9 | 1. Install using `pip install vdblite`
10 | 2. Run a test with the following code:
11 | 
12 | ```python
13 | import vdblite
14 | import pickle
15 | import numpy as np
16 | from time import time
17 | from uuid import uuid4
18 | import sys
19 | from pprint import pprint as pp
20 | 
21 | 
22 | if __name__ == '__main__':
23 |     vdb = vdblite.Vdb()
24 |     dimension = 12    # dimensions of each vector                         
25 |     n = 200    # number of vectors                   
26 |     np.random.seed(1)             
27 |     db_vectors = np.random.random((n, dimension)).astype('float32')
28 |     print(db_vectors[0])
29 |     for vector in db_vectors:
30 |         info = {'vector': vector, 'time': time(), 'uuid': str(uuid4())}
31 |         vdb.add(info)
32 |     vdb.details()
33 |     results = vdb.search(db_vectors[10])
34 |     pp(results)
35 | ```


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 David Shapiro
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/vdblite/__init__.py:
--------------------------------------------------------------------------------
 1 | import pickle
 2 | import numpy as np
 3 | from time import time
 4 | import faiss
 5 | #from uuid import uuid4
 6 | import sys
 7 | #from pprint import pprint as pp
 8 | 
 9 | 
class Vdb():
    """In-memory store of dict payloads with brute-force vector search.

    Payloads are kept in insertion order in ``self.data``.  ``search`` ranks
    them by the dot product between a query vector and the value stored under
    a chosen field.  ``initialize_index`` can additionally build a faiss L2
    index over the stored vectors and expose it as ``self.index``.
    """

    def __init__(self):
        self.data = list()  # stored payload dicts
        self.index = None   # faiss index; only set by initialize_index()

    def add(self, payload):  # payload is a DICT
        """Append *payload* to the store."""
        self.data.append(payload)  # uuid could be in payload :)

    @staticmethod
    def _matches(record, field, value):
        """Return True when ``record[field] == value`` evaluates truthy.

        Records that lack *field*, or whose comparison cannot be reduced to a
        single boolean (e.g. multi-element numpy arrays), count as no match.
        """
        try:
            return bool(record[field] == value)
        except (KeyError, IndexError, TypeError, ValueError):
            return False

    def delete(self, field, value, firstonly=False):
        """Remove every record whose *field* equals *value*.

        Args:
            field: key to look up in each payload.
            value: value to compare against, e.g. a timestamp such as
                1657225709.8192494 when field is 'timestamp'.
            firstonly: when True, remove only the first matching record.
        """
        kept = []
        finished = False
        for record in self.data:
            if not finished and self._matches(record, field, value):
                finished = firstonly
                continue
            kept.append(record)
        # Rebuild instead of removing while iterating, which skips the
        # element right after each removal.  Slice-assign keeps the same
        # list object alive for anyone holding a reference to self.data.
        self.data[:] = kept

    def initialize_index(self, field='vector'):
        """Build a faiss ``IndexFlatL2`` over the vectors stored under *field*.

        Records without *field* are skipped, so row ``i`` of the index is the
        ``i``-th record that has the field.  When no record has the field,
        ``self.index`` is left untouched.  The index is a snapshot: call this
        again after adding or deleting records.

        Raises:
            ValueError: if the stored vectors do not stack into a 2-D array.
        """
        vectors = [record[field] for record in self.data if field in record]
        if not vectors:
            return
        matrix = np.ascontiguousarray(vectors, dtype='float32')
        if matrix.ndim != 2:
            raise ValueError(
                'vectors stored under %r must be 1-D and of equal length' % (field,))
        # The index dimension is the vector length, not the number of vectors.
        index = faiss.IndexFlatL2(matrix.shape[1])
        index.add(matrix)
        self.index = index

    def search(self, vector, field='vector', count=5):
        """Return up to *count* records ranked by dot product with *vector*.

        Each hit is a shallow copy of the stored payload with an extra
        'score' key, so the stored records are never modified.  Records that
        cannot be scored (missing *field*, incompatible shape or type) are
        reported with ``print`` and skipped.

        Args:
            vector: query vector.
            field: payload key holding the vector to compare against.
            count: maximum number of hits; ``None`` returns all of them.

        Returns:
            List of payload copies sorted by descending 'score'.
        """
        results = []
        for record in self.data:
            try:
                score = np.dot(record[field], vector)
            except (KeyError, IndexError, TypeError, ValueError) as oops:
                print(oops)
                continue
            hit = dict(record)
            hit['score'] = score
            results.append(hit)
        results.sort(key=lambda d: d['score'], reverse=True)
        return results[0:count]

    def purge(self):
        """Drop all stored records."""
        self.data = list()

    def save(self, filepath):
        """Pickle the stored records to *filepath*."""
        with open(filepath, 'wb') as outfile:
            pickle.dump(self.data, outfile)

    def load(self, filepath):
        """Replace the stored records with the pickle found at *filepath*.

        WARNING: unpickling can execute arbitrary code; only load files that
        come from a trusted source.
        """
        # 'rb', not 'wb': write mode truncates the file and cannot be read.
        with open(filepath, 'rb') as infile:
            self.data = pickle.load(infile)

    def details(self):
        """Print the number of records and the size of the record list."""
        print('DB elements #:', len(self.data))
        # sys.getsizeof is shallow: it measures the list object itself, not
        # the payloads it refers to.
        print('DB size in memory:', sys.getsizeof(self.data), 'bytes')
76 | 


--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
 1 | # in-memory vector similarity search
 2 | # objects and/or metadata attached to vectors
 3 | 
 4 | # row 1: "text info here", <<vector here>>, <<timestamp>>
 5 | 
 6 | # row 2: "AGI memory 1", <<vector 1 - 512d similarity>>, <<vector 2 - 2048 similarity vector>>, <<timestamps>>, <<filename>>
 7 | 
 8 | 
 9 | import pickle
10 | import numpy as np
11 | from time import time
12 | from uuid import uuid4
13 | import sys
14 | from pprint import pprint as pp
15 | 
16 | 
class Vdb():
    """Minimal in-memory vector store.

    Keeps a list of dict payloads in ``self.data`` and ranks them by the dot
    product between a query vector and the value stored under a chosen field.
    """

    def __init__(self):
        self.data = list()  # stored payload dicts, in insertion order

    def add(self, payload):  # payload is a DICT
        """Append *payload* to the store."""
        self.data.append(payload)  # uuid could be in payload :)

    @staticmethod
    def _matches(record, field, value):
        """Return True when ``record[field] == value`` evaluates truthy.

        A missing *field*, or a comparison that cannot be reduced to a single
        boolean (e.g. multi-element numpy arrays), counts as no match.
        """
        try:
            return bool(record[field] == value)
        except (KeyError, IndexError, TypeError, ValueError):
            return False

    def delete(self, field, value, firstonly=False):
        """Remove every record whose *field* equals *value*.

        Args:
            field: key to look up in each payload.
            value: value to compare against, e.g. a timestamp such as
                1657225709.8192494 when field is 'timestamp'.
            firstonly: when True, remove only the first matching record.
        """
        survivors = []
        done = False
        for record in self.data:
            if not done and self._matches(record, field, value):
                done = firstonly
                continue
            survivors.append(record)
        # Build a new list rather than removing during iteration (which skips
        # the element after each removal); slice-assign keeps list identity.
        self.data[:] = survivors

    def search(self, vector, field='vector', count=5):
        """Return up to *count* records ranked by dot product with *vector*.

        Each hit is a shallow copy of the stored payload with an added
        'score' key; stored records are left unmodified.  Records that cannot
        be scored (missing *field*, incompatible shape or type) are reported
        with ``print`` and skipped.

        Args:
            vector: query vector.
            field: payload key holding the vector to compare against.
            count: maximum number of hits; ``None`` returns all of them.

        Returns:
            List of payload copies sorted by descending 'score'.
        """
        scored = []
        for record in self.data:
            try:
                score = np.dot(record[field], vector)
            except (KeyError, IndexError, TypeError, ValueError) as oops:
                print(oops)
                continue
            scored.append(dict(record, score=score))
        scored.sort(key=lambda d: d['score'], reverse=True)
        return scored[0:count]

    def purge(self):
        """Drop all stored records."""
        self.data = list()

    def save(self, filepath):
        """Pickle the stored records to *filepath*."""
        with open(filepath, 'wb') as outfile:
            pickle.dump(self.data, outfile)

    def load(self, filepath):
        """Replace the stored records with the pickle found at *filepath*.

        WARNING: unpickling can execute arbitrary code; only load files that
        come from a trusted source.
        """
        # Must be read mode: 'wb' truncates the file and cannot be read.
        with open(filepath, 'rb') as infile:
            self.data = pickle.load(infile)

    def details(self):
        """Print the number of records and the size of the record list."""
        print('DB elements #:', len(self.data))
        # Shallow size only: the payload dicts themselves are not counted.
        print('DB size in memory:', sys.getsizeof(self.data), 'bytes')
66 | 
67 | 
68 | 
if __name__ == '__main__':
    # Demo: fill a Vdb with random float32 vectors, print its stats, then
    # search using one of the stored vectors as the query.
    np.random.seed(1)  # fixed seed so every run generates the same vectors
    num_vectors, num_dims = 200, 12
    db_vectors = np.random.random((num_vectors, num_dims)).astype('float32')
    print(db_vectors[0])

    vdb = Vdb()
    for row in db_vectors:
        # each payload carries the vector plus a timestamp and a unique id
        vdb.add({'vector': row, 'time': time(), 'uuid': str(uuid4())})
    vdb.details()

    pp(vdb.search(db_vectors[10]))


--------------------------------------------------------------------------------