├── README.md
├── examples
│   └── openai_embeddings_from_text.py
├── pyghostdb
│   ├── __init__.py
│   ├── ghost_storage.py
│   ├── hnswlib_index.py
│   ├── parquet_conversion.py
│   ├── test_hnsw.py
│   └── text_storage.py
└── pyproject.toml


/README.md:
--------------------------------------------------------------------------------
# GhostDB

GhostDB is a Python package that provides a fast and efficient way to store and search embeddings using HNSW (Hierarchical Navigable Small World) indexing from the hnswlib library. It is designed for applications that require fast nearest-neighbor search, such as natural language processing, image recognition, and recommendation systems.

## Features

- Fast approximate nearest-neighbor search using HNSW indexing
- Support for high-dimensional data
- Persistence of the index and text data on disk
- Easy-to-use API for adding, searching, and managing embeddings
- Support for batch operations

## Installation

```bash
pip install pyghostdb
```

## Usage

Here is an example of how to use GhostDB:

```python
import numpy as np
from pyghostdb.ghost_storage import GhostStorage

# Initialize GhostStorage with default settings
ghost_storage = GhostStorage()

# Add an embedding to the storage
text_id = 1
text = "Sample text"
embedding = np.random.rand(1536)
ghost_storage.upsert(text_id, text, embedding)

# Search for the nearest neighbor of a query embedding
query_embedding = np.random.rand(1536)
result = ghost_storage.search(query_embedding, k=1)

# Each result is an (id, text, embedding) tuple
_, nearest_text, nearest_embedding = result[0]
print(f"Nearest text: {nearest_text}, Nearest embedding: {nearest_embedding}")

# Persist the index and text storage to disk
ghost_storage.persist()

# Load the index and text storage from disk
ghost_storage.load()
```

## API Reference

### `class GhostStorage`

The main class for storing and searching embeddings.

#### `__init__(self, dim=1536, max_elements=10**5, persist_dir="ghost_dir")`

Initialize the GhostStorage instance.

- `dim`: The dimension of the embeddings (default: 1536)
- `max_elements`: The maximum number of elements that can be stored in the index (default: 10^5)
- `persist_dir`: The directory where the index and text storage will be persisted (default: "ghost_dir")

#### `upsert(self, text_id, text, embedding)`

Add a single text and its embedding to the storage, overwriting any existing entry with the same id.

- `text_id`: The unique identifier of the text
- `text`: The text associated with the embedding
- `embedding`: The embedding as a numpy array or list

#### `upsert_overwrite_bulk(self, ids: list[int], texts: list[str], embeddings: np.ndarray)`

Add multiple texts and their embeddings to the storage in a single batch.

- `ids`: A list of unique identifiers for the texts
- `texts`: A list of texts associated with the embeddings
- `embeddings`: A numpy array containing the embeddings, one row per text

#### `search(self, embedding, k=1)`

Search for the k nearest neighbors of a query embedding.

- `embedding`: The query embedding as a numpy array or list
- `k`: The number of nearest neighbors to search for (default: 1)

Returns a list of `(id, text, embedding)` tuples, one per nearest neighbor.
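Inserting a few known vectors and querying one of them back makes the return format concrete. Below is a minimal sketch; the ids, texts, and `demo_dir` directory are made-up example values, and the directory is assumed to start out empty:

```python
import numpy as np
from pyghostdb.ghost_storage import GhostStorage

storage = GhostStorage(dim=3, max_elements=10, persist_dir="demo_dir")

# Three toy vectors with ids 1-3
storage.upsert_overwrite_bulk(
    [1, 2, 3],
    ["red", "green", "blue"],
    np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]], dtype=np.float32),
)

# The index uses cosine distance, so a query close to "red" returns id 1 first
for id_, text, embedding in storage.search([0.9, 0.1, 0.0], k=2):
    print(id_, text)
```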
#### `clear(self)`

Clear the in-memory index and remove the text storage database from disk.

#### `persist(self)`

Persist the HNSW index to disk. (The text storage is a DuckDB database file and is written to disk as entries are added.)

#### `load(self)`

Load the HNSW index from disk. This is also called automatically on construction when a persisted index is found in `persist_dir`.
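A typical workflow ends one session with `persist()` and starts the next by constructing `GhostStorage` with the same `persist_dir`. A minimal sketch (`demo_dir` is a made-up example directory, assumed writable):

```python
from pyghostdb.ghost_storage import GhostStorage

# First session: add data, then write the index to disk
storage = GhostStorage(dim=3, max_elements=10, persist_dir="demo_dir")
storage.upsert(1, "hello", [1.0, 2.0, 3.0])
storage.persist()

# Later session: the constructor finds the persisted index in demo_dir and
# loads it automatically; the DuckDB text store is already on disk
storage = GhostStorage(dim=3, max_elements=10, persist_dir="demo_dir")
print(storage.search([1.0, 2.0, 3.0], k=1))  # -> [(1, 'hello', [1.0, 2.0, 3.0])]
```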
--------------------------------------------------------------------------------
/examples/openai_embeddings_from_text.py:
--------------------------------------------------------------------------------
from pyghostdb.ghost_storage import GhostStorage
# Legacy (pre-1.0) OpenAI SDK import; the API key is read from the
# OPENAI_API_KEY environment variable.
from openai.api_resources.embedding import Embedding
import numpy as np


def calculate_embeddings_from_texts(texts: list[str]):
    # Replace newlines, which can negatively affect embedding quality.
    texts = [t.replace("\n", " ") for t in texts]
    # Call the OpenAI Embedding API once with the whole batch of documents
    return [
        result["embedding"]
        for result in Embedding.create(
            input=texts,
            engine='text-embedding-ada-002',
        )["data"]
    ]


if __name__ == "__main__":
    # Initialize GhostStorage with default settings
    ghost_storage = GhostStorage()

    # Prepare some texts
    texts = ["Hello world!", "I love to code in Python", "Artificial Intelligence is fascinating"]

    # Calculate embeddings for the texts
    embeddings = calculate_embeddings_from_texts(texts)

    # Add texts and their embeddings to the storage
    for i, (text, embedding) in enumerate(zip(texts, embeddings)):
        ghost_storage.upsert(i, text, embedding)

    # Search for the nearest neighbor of a (random) query embedding
    query_embedding = np.random.rand(1536)
    result = ghost_storage.search(query_embedding, k=1)

    # Get the text and embedding of the nearest neighbor
    _, nearest_text, nearest_embedding = result[0]
    print(f"Nearest text: {nearest_text}, Nearest embedding: {nearest_embedding}")

    # Persist the index and text storage to disk
    ghost_storage.persist()

    # Load the index and text storage from disk
    ghost_storage.load()
--------------------------------------------------------------------------------
/pyghostdb/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ghost-db/ghostdb/6f1db3e18f782e823d75edbce23decefed3640c2/pyghostdb/__init__.py
--------------------------------------------------------------------------------
/pyghostdb/ghost_storage.py:
--------------------------------------------------------------------------------
import numpy as np

from pyghostdb.hnswlib_index import HNSWIndex
from pyghostdb.text_storage import TextStorage
import os


class GhostStorage:
    hnsw_index = None
    text_storage_db = None

    def __init__(self, dim=1536, max_elements=10**5, persist_dir="ghost_dir"):
        self.hnsw_index = HNSWIndex(dim=dim, max_elements=max_elements, ef=200, M=16)
        self.hnsw_index.init_index()
        self.persist_dir = persist_dir
        if not os.path.exists(self.persist_dir):
            os.makedirs(self.persist_dir)
        self.text_storage_db = TextStorage(self.text_storage_filepath())
        # Reload a previously persisted index if one exists
        if self.hnsw_index_filepath_exists():
            self.load()

    def upsert(self, text_id, text, embedding):
        if self.hnsw_index is None:
            raise AttributeError("HNSW index is not initialized, please call the load method")
        if not isinstance(embedding, np.ndarray):
            try:
                embedding = np.array(embedding)
            except Exception as e:
                print(e)
                raise TypeError("embedding must be a numpy array or a list")
        if embedding.shape[0] != self.hnsw_index.dim:
            raise ValueError("embedding must have the same dimension as the HNSW index")
        # Flatten and convert the numpy array to a list of floats for storage
        embedding = [float(x) for x in list(embedding.reshape(-1))]
        self.text_storage_db.add(text_id, text, embedding)
        self.hnsw_index.add_items(embedding, [text_id])

    def upsert_overwrite_bulk(self, ids: list[int], texts: list[str], embeddings: np.ndarray):
        self.hnsw_index.add_items(embeddings, ids)
        # Convert the numpy array to lists of floats for the text store
        embeddings = [list(embedding) for embedding in embeddings]
        self.text_storage_db.add_multiple(ids, texts, embeddings)

    def search(self, embedding, k=1):
        if isinstance(embedding, list):
            embedding = np.array(embedding)
        ids, distances = self.hnsw_index.knn_query(embedding, k=k)
        # Convert the returned ids to a flat list of ints
        ids = [int(id_) for id_ in ids.flatten().tolist()]
        return [self.text_storage_db.get(id_) for id_ in ids]

    def clear(self):
        self.hnsw_index = None
        self.text_storage_db.remove_db()

    def hnsw_index_filepath(self):
        return os.path.join(self.persist_dir, "index.ghostdb")

    def hnsw_index_filepath_exists(self):
        return os.path.exists(self.hnsw_index_filepath())

    def text_storage_filepath(self):
        return os.path.join(self.persist_dir, "text_storage.db")

    def persist(self):
        self.hnsw_index.save_to_file(self.hnsw_index_filepath())

    def load(self):
        self.hnsw_index.load_from_file(self.hnsw_index_filepath())
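

# Illustrative sketch, not part of the library API: a bulk insert followed by a
# search. The ids, texts, and "demo_dir" directory are made-up example values.
if __name__ == "__main__":
    store = GhostStorage(dim=3, max_elements=100, persist_dir="demo_dir")
    store.upsert_overwrite_bulk(
        [1, 2],
        ["first text", "second text"],
        np.float32(np.random.random((2, 3))),
    )
    # Each result is an (id, text, embedding) tuple
    for id_, text, _ in store.search(np.float32([0.5, 0.5, 0.5]), k=2):
        print(id_, text)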
--------------------------------------------------------------------------------
/pyghostdb/hnswlib_index.py:
--------------------------------------------------------------------------------
import hnswlib
import numpy as np


class HNSWIndex:
    def __init__(self, dim, max_elements, ef=200, M=16):
        """
        :param dim: embedding dimension
        :param max_elements: maximum number of elements in the index
        :param ef: the size of the dynamic list of nearest neighbors used during search. A higher ef leads to more accurate but slower search. ef cannot be set lower than the number of queried nearest neighbors k; the value of ef can be anything between k and the size of the dataset.
        :param M: the number of bi-directional links created for every new element during construction. A reasonable range for M is 2-100. A higher M works better on datasets with high intrinsic dimensionality and/or when high recall is required, while a lower M works better for datasets with low intrinsic dimensionality and/or lower recall requirements. The parameter also determines the algorithm's memory consumption, which is roughly M * 8-10 bytes per stored element.
        """
        self.dim = dim
        self.max_elements = max_elements
        self.ef = ef
        self.M = M
        self.index = None

    def init_index(self):
        """
        Initialize an empty hnswlib index
        """
        self.index = hnswlib.Index(space='cosine', dim=self.dim)
        self.index.init_index(max_elements=self.max_elements, ef_construction=self.ef, M=self.M)
        self.index.set_ef(self.ef)  # query-time ef should always be > k

    def build_index(self, embedding_vectors):
        """
        Build an hnswlib index from a set of embedding vectors
        """
        assert embedding_vectors.shape[1] == self.dim

        num_elements, dim = embedding_vectors.shape
        self.index = hnswlib.Index(space='cosine', dim=dim)  # possible spaces are l2, cosine or ip
        self.index.init_index(max_elements=num_elements, ef_construction=self.ef, M=self.M)
        self.index.add_items(embedding_vectors)
        self.index.set_ef(self.ef)  # ef should always be > k

    def add_items(self, embedding_vectors, ids=None):
        """
        Add new embedding vectors to the index
        """
        self.index.add_items(embedding_vectors, ids=ids)

    def knn_query(self, embedding_vectors, k=1):
        """
        Query the index for the k nearest neighbors of the given embedding vectors
        """
        # Promote a single 1D vector to a 2D batch of one
        if len(embedding_vectors.shape) == 1:
            embedding_vectors = np.expand_dims(embedding_vectors, axis=0)
        if embedding_vectors.shape[1] != self.dim:
            raise ValueError("embedding_vectors must be of shape (num_vectors, {})".format(self.dim))
        if k > self.ef:
            raise ValueError("k must be less than or equal to ef")

        labels, distances = self.index.knn_query(embedding_vectors, k=k)
        return labels, distances

    def save_to_file(self, path):
        """
        Save the index to disk
        """
        self.index.save_index(path)

    def load_from_file(self, path):
        """
        Load the index from disk
        """
        self.index.load_index(path)
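

# Illustrative sketch, not part of the library API: how ef and M trade speed
# for recall. All numbers below are made-up example values.
#
# Rough memory estimate from the docstring above, for dim=1536 float32 vectors
# with M=16: vector data is 1536 * 4 = 6144 bytes per element, plus roughly
# 16 * 8-10 = 128-160 bytes of graph links per element.
if __name__ == "__main__":
    data = np.float32(np.random.random((1000, 64)))

    # Low ef and M: faster queries and less memory, potentially lower recall
    fast_index = HNSWIndex(dim=64, max_elements=1000, ef=10, M=8)
    fast_index.build_index(data)

    # High ef and M: slower queries and more memory, higher recall
    accurate_index = HNSWIndex(dim=64, max_elements=1000, ef=200, M=32)
    accurate_index.build_index(data)

    query = np.float32(np.random.random(64))
    print(fast_index.knn_query(query, k=5))
    print(accurate_index.knn_query(query, k=5))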
--------------------------------------------------------------------------------
/pyghostdb/parquet_conversion.py:
--------------------------------------------------------------------------------
import duckdb
import pyarrow as pa
import pyarrow.parquet


def write_to_parquet(text_ids: list[int], texts: list[str], embeddings: list[list[float]], output_file: str):
    schema = pa.schema([
        ('id_', pa.int32()),
        ('text', pa.string()),
        ('embedding', pa.list_(pa.float32())),
    ])
    data = {
        'id_': text_ids,
        'text': texts,
        'embedding': embeddings,
    }
    # Create Arrow arrays from the dictionary
    arrays = [pa.array(data[column], type=field_type) for column, field_type in zip(data.keys(), schema.types)]

    # Create an Arrow RecordBatch
    record_batch = pa.RecordBatch.from_arrays(arrays, schema=schema)

    # Convert the RecordBatch to an Arrow Table
    table = pa.Table.from_batches([record_batch])

    # Save the Table as a Parquet file
    pa.parquet.write_table(table, output_file)


def from_parquet_to_duckdb(parquet_file: str, duckdb_file: str, table_name: str):
    # Connect to DuckDB (this will create a new file if it doesn't exist)
    conn = duckdb.connect(duckdb_file)

    conn.execute(f''' DROP TABLE IF EXISTS {table_name} ''')
    conn.close()

    conn = duckdb.connect(duckdb_file)
    # Create a table with the appropriate schema
    conn.execute(f'''
    CREATE TABLE IF NOT EXISTS {table_name} (id_ INTEGER PRIMARY KEY, text TEXT, embedding DOUBLE[]);
    ''')

    # Import the Parquet file into the table
    conn.execute(f'''
    COPY {table_name} FROM '{parquet_file}' (FORMAT 'parquet');
    ''')

    # Close the connection
    conn.close()


if __name__ == "__main__":
    # Sample (id_, text, embedding) rows
    data = {
        'text': ['sample text 1', 'sample text 2'],
        'embedding': [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],
        'id_': [1, 2]
    }

    write_to_parquet(data['id_'], data['text'], data['embedding'], "test.parquet")

    from_parquet_to_duckdb("test.parquet", "my_database.duckdb", "my_table3355")
    # Print the DuckDB table
    conn = duckdb.connect("my_database.duckdb")
    print(conn.execute("SELECT * FROM my_table3355").fetchall())
    conn.close()
--------------------------------------------------------------------------------
/pyghostdb/test_hnsw.py:
--------------------------------------------------------------------------------
import numpy as np

from pyghostdb.ghost_storage import GhostStorage
from pyghostdb.hnswlib_index import HNSWIndex
from pyghostdb.text_storage import TextStorage


def test_hnsw():
    n_elements = 100
    index = HNSWIndex(dim=1024, max_elements=n_elements)
    index.build_index(np.float32(np.random.random((n_elements, 1024))))
    test_data = np.float32(np.random.random((100, 1024)))
    labels1, distances = index.knn_query(test_data, k=1)


def test_small_vectors():
    n_elements = 100
    index = HNSWIndex(dim=3, max_elements=n_elements)
    index.build_index(np.float32(np.random.random((n_elements, 3))))
    test_data = np.float32(np.random.random((100, 3)))
    labels1, distances = index.knn_query(test_data, k=1)
    print(distances)


def test_small_vectors2():
    n_elements = 100
    index = HNSWIndex(dim=3, max_elements=n_elements)
    index.init_index()
    index.add_items(np.float32(np.random.random((n_elements, 3))), list(range(n_elements)))
    test_data = np.float32(np.random.random((5, 3)))
    labels1, distances = index.knn_query(test_data, k=3)
    print(labels1)


def test_text_storage():
    embedding = [1.23, 4.56, 7.89]
    text = "hello world"
    id_ = 123
    text_storage = TextStorage(db_path="trademark.db")
    text_storage.add(id_=id_, text=text, embedding=embedding)
    id_, text, vector_data = text_storage.load(id_=id_)
    text_storage.remove_db()
    print(id_, text, vector_data)


def test_ghostdb():
    gs = GhostStorage(dim=3, max_elements=100)
    gs.upsert(1, 'hello world', np.random.random((3, 1)))
    gs.upsert(2, 'hello world 2', np.random.random((3, 1)))
    gs.upsert(3, 'hello world 3', np.random.random((3, 1)))
    gs.upsert(4, 'hello world 4', np.random.random((3, 1)))
    gs.upsert(5, 'hello world 5', np.random.random((3, 1)))
    # Re-using id 5 exercises the overwrite path of upsert
    gs.upsert(5, 'hello world 6', np.random.random((3, 1)))

    print(gs.search(np.array([1, 2, 3]), k=1))


def test_add_multiple():
    gs = GhostStorage(dim=3, max_elements=100)
    gs.upsert_overwrite_bulk(
        [1, 2, 3, 4, 5, 6],
        ['hello world', 'hello world 2', 'hello world 3',
         'hello world 4', 'hello world 5', 'hello world 6'],
        np.random.random((6, 3)))
    print(gs.search(np.array([1, 2, 3]), k=1))


if __name__ == '__main__':
    test_hnsw()
    test_small_vectors2()
    test_text_storage()
    test_ghostdb()
    test_add_multiple()
--------------------------------------------------------------------------------
/pyghostdb/text_storage.py:
--------------------------------------------------------------------------------
import os

import duckdb

from pyghostdb.parquet_conversion import write_to_parquet, from_parquet_to_duckdb


class TextStorage:
    def __init__(self, db_path: str):
        self.db_path = db_path

    def add(self, id_: int, text: str, embedding: list[float]):
        connection = duckdb.connect(self.db_path)
        cursor = connection.cursor()

        # Create a table with an ARRAY column to store the vector
        cursor.execute("CREATE TABLE IF NOT EXISTS vectors (id_ INTEGER PRIMARY KEY, text TEXT, embedding DOUBLE[])")

        # Remove the old row if it exists so the insert acts as an upsert
        cursor.execute("DELETE FROM vectors WHERE id_ = ?", (id_,))
        # Insert the vector into the table
        cursor.execute("""
        INSERT INTO vectors (id_, text, embedding)
        VALUES (?, ?, ?)
        """, (id_, text, embedding))

        connection.commit()
        connection.close()

    def add_multiple(self, ids: list[int], texts: list[str], embeddings: list[list[float]]):
        # Write the batch to a temporary Parquet file, then bulk-load it into DuckDB
        parquet_filepath = f"{self.db_path}_tmp.parquet"
        write_to_parquet(ids, texts, embeddings, parquet_filepath)
        from_parquet_to_duckdb(parquet_filepath, self.db_path, "vectors")
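
    # Design note: add_multiple stages the batch in a Parquet file and loads it
    # with DuckDB's COPY rather than issuing row-by-row INSERTs, since a single
    # columnar COPY is much faster for large batches. Note that the underlying
    # from_parquet_to_duckdb call drops and recreates the "vectors" table, so a
    # bulk load replaces existing rows rather than appending to them.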
    def get(self, id_):
        connection = duckdb.connect(self.db_path)
        cursor = connection.cursor()

        id_int = int(id_)
        # Load the full row from the table
        cursor.execute("SELECT id_, text, embedding FROM vectors WHERE id_ = ?", (id_int,))
        id_, text, vector_data = cursor.fetchone()

        return id_, text, vector_data

    def get_text(self, id_):
        connection = duckdb.connect(self.db_path)
        cursor = connection.cursor()

        id_int = int(id_)
        # Load only the text from the table
        cursor.execute("SELECT id_, text FROM vectors WHERE id_ = ?", (id_int,))
        id_, text = cursor.fetchone()

        return text

    def load(self, id_: int):
        connection = duckdb.connect(self.db_path)
        cursor = connection.cursor()

        # Load the full row from the table
        cursor.execute("SELECT id_, text, embedding FROM vectors WHERE id_ = ?", (id_,))
        id_, text, vector_data = cursor.fetchone()

        return id_, text, vector_data

    def remove_db(self):
        os.remove(self.db_path)
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
requires = ["setuptools", "wheel"]

[project]
name = "pyghostdb"
version = "0.0.5"
description = "GhostDB is a fast, lightweight embedding similarity search database."
authors = [{name="Fedor Shabashev", email="fedor.shabashev@gmail.com"}]

dependencies = [
    "numpy >=1.21.0",
    "hnswlib >=0.5.2",
    "duckdb >=0.2.9",
    "pyarrow>=5.0.0",
]

[project.urls]
GitHub = "https://github.com/ghost-db/ghostdb"
--------------------------------------------------------------------------------