├── README.md
├── examples
│   └── openai_embeddings_from_text.py
├── pyghostdb
│   ├── __init__.py
│   ├── ghost_storage.py
│   ├── hnswlib_index.py
│   ├── parquet_conversion.py
│   ├── test_hnsw.py
│   └── text_storage.py
└── pyproject.toml


/README.md:
--------------------------------------------------------------------------------
# GhostDB

GhostDB is a Python package that provides a fast and efficient way to store and search embeddings using HNSW (Hierarchical Navigable Small World) indexing from the hnswlib library. It is designed for applications that require fast nearest-neighbor search, such as natural language processing, image recognition, and recommendation systems.

## Features

- Fast approximate nearest-neighbor search using HNSW indexing
- Support for high-dimensional data
- Persistence of the index and text data on disk
- Easy-to-use API for adding, searching, and managing embeddings
- Support for batch operations

## Installation

```bash
pip install pyghostdb
```

## Usage

Here is an example of how to use GhostDB:

```python
import numpy as np
from pyghostdb.ghost_storage import GhostStorage

# Initialize GhostStorage with default settings
ghost_storage = GhostStorage()

# Add an embedding to the storage
text_id = 1
text = "Sample text"
embedding = np.random.rand(1536)
ghost_storage.upsert(text_id, text, embedding)

# Search for the nearest neighbor of a query embedding
query_embedding = np.random.rand(1536)
result = ghost_storage.search(query_embedding, k=1)

# Each result is an (id, text, embedding) tuple
_, nearest_text, nearest_embedding = result[0]
print(f"Nearest text: {nearest_text}, Nearest embedding: {nearest_embedding}")

# Persist the index and text storage to disk
ghost_storage.persist()

# Load the index and text storage from disk
ghost_storage.load()
```

## API Reference

### `class GhostStorage`

The main class for storing and searching embeddings.

#### `__init__(self, dim=1536, max_elements=10**5, persist_dir="ghost_dir")`

Initialize the GhostStorage instance.

- `dim`: The dimension of the embeddings (default: 1536)
- `max_elements`: The maximum number of elements that can be stored in the index (default: 10^5)
- `persist_dir`: The directory where the index and text storage will be persisted (default: "ghost_dir")

#### `upsert(self, text_id, text, embedding)`

Add a single text and its embedding to the storage, overwriting any existing entry with the same id.

- `text_id`: The unique identifier of the text
- `text`: The text associated with the embedding
- `embedding`: The embedding as a numpy array or list

#### `upsert_overwrite_bulk(self, ids: list[int], texts: list[str], embeddings: np.ndarray)`

Add multiple texts and their embeddings to the storage in a single batch.

- `ids`: A list of unique identifiers for the texts
- `texts`: A list of texts associated with the embeddings
- `embeddings`: A numpy array containing the embeddings, one row per text

#### `search(self, embedding, k=1)`

Search for the k nearest neighbors of a query embedding.

- `embedding`: The query embedding as a numpy array or list
- `k`: The number of nearest neighbors to search for (default: 1)

Returns a list of `(id, text, embedding)` tuples, one per nearest neighbor.
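Inserting a few known vectors and querying one of them back makes the return format concrete. Below is a minimal sketch; the ids, texts, and `demo_dir` directory are made-up example values, and the directory is assumed to start out empty:

```python
import numpy as np
from pyghostdb.ghost_storage import GhostStorage

storage = GhostStorage(dim=3, max_elements=10, persist_dir="demo_dir")

# Three toy vectors with ids 1-3
storage.upsert_overwrite_bulk(
    [1, 2, 3],
    ["red", "green", "blue"],
    np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]], dtype=np.float32),
)

# The index uses cosine distance, so a query close to "red" returns id 1 first
for id_, text, embedding in storage.search([0.9, 0.1, 0.0], k=2):
    print(id_, text)
```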
#### `clear(self)`

Clear the in-memory index and remove the text storage database from disk.

#### `persist(self)`

Persist the HNSW index to disk. (The text storage is a DuckDB database file and is written to disk as entries are added.)

#### `load(self)`

Load the HNSW index from disk. This is also called automatically on construction when a persisted index is found in `persist_dir`.
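A typical workflow ends one session with `persist()` and starts the next by constructing `GhostStorage` with the same `persist_dir`. A minimal sketch (`demo_dir` is a made-up example directory, assumed writable):

```python
from pyghostdb.ghost_storage import GhostStorage

# First session: add data, then write the index to disk
storage = GhostStorage(dim=3, max_elements=10, persist_dir="demo_dir")
storage.upsert(1, "hello", [1.0, 2.0, 3.0])
storage.persist()

# Later session: the constructor finds the persisted index in demo_dir and
# loads it automatically; the DuckDB text store is already on disk
storage = GhostStorage(dim=3, max_elements=10, persist_dir="demo_dir")
print(storage.search([1.0, 2.0, 3.0], k=1))  # -> [(1, 'hello', [1.0, 2.0, 3.0])]
```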
--------------------------------------------------------------------------------
/examples/openai_embeddings_from_text.py:
--------------------------------------------------------------------------------
from pyghostdb.ghost_storage import GhostStorage
# Legacy (pre-1.0) OpenAI SDK import; the API key is read from the
# OPENAI_API_KEY environment variable.
from openai.api_resources.embedding import Embedding
import numpy as np


def calculate_embeddings_from_texts(texts: list[str]):
    # Replace newlines, which can negatively affect embedding quality.
    texts = [t.replace("\n", " ") for t in texts]
    # Call the OpenAI Embedding API once with the whole batch of documents
    return [
        result["embedding"]
        for result in Embedding.create(
            input=texts,
            engine='text-embedding-ada-002',
        )["data"]
    ]


if __name__ == "__main__":
    # Initialize GhostStorage with default settings
    ghost_storage = GhostStorage()

    # Prepare some texts
    texts = ["Hello world!", "I love to code in Python", "Artificial Intelligence is fascinating"]

    # Calculate embeddings for the texts
    embeddings = calculate_embeddings_from_texts(texts)

    # Add texts and their embeddings to the storage
    for i, (text, embedding) in enumerate(zip(texts, embeddings)):
        ghost_storage.upsert(i, text, embedding)

    # Search for the nearest neighbor of a (random) query embedding
    query_embedding = np.random.rand(1536)
    result = ghost_storage.search(query_embedding, k=1)

    # Get the text and embedding of the nearest neighbor
    _, nearest_text, nearest_embedding = result[0]
    print(f"Nearest text: {nearest_text}, Nearest embedding: {nearest_embedding}")

    # Persist the index and text storage to disk
    ghost_storage.persist()

    # Load the index and text storage from disk
    ghost_storage.load()
--------------------------------------------------------------------------------
/pyghostdb/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ghost-db/ghostdb/6f1db3e18f782e823d75edbce23decefed3640c2/pyghostdb/__init__.py
--------------------------------------------------------------------------------
/pyghostdb/ghost_storage.py:
--------------------------------------------------------------------------------
import numpy as np

from pyghostdb.hnswlib_index import HNSWIndex
from pyghostdb.text_storage import TextStorage
import os


class GhostStorage:
    hnsw_index = None
    text_storage_db = None

    def __init__(self, dim=1536, max_elements=10**5, persist_dir="ghost_dir"):
        self.hnsw_index = HNSWIndex(dim=dim, max_elements=max_elements, ef=200, M=16)
        self.hnsw_index.init_index()
        self.persist_dir = persist_dir
        if not os.path.exists(self.persist_dir):
            os.makedirs(self.persist_dir)
        self.text_storage_db = TextStorage(self.text_storage_filepath())
        # Reload a previously persisted index if one exists
        if self.hnsw_index_filepath_exists():
            self.load()

    def upsert(self, text_id, text, embedding):
        if self.hnsw_index is None:
            raise AttributeError("HNSW index is not initialized, please call the load method")
        if not isinstance(embedding, np.ndarray):
            try:
                embedding = np.array(embedding)
            except Exception as e:
                print(e)
                raise TypeError("embedding must be a numpy array or a list")
        if embedding.shape[0] != self.hnsw_index.dim:
            raise ValueError("embedding must have the same dimension as the HNSW index")
        # Flatten and convert the numpy array to a list of floats for storage
        embedding = [float(x) for x in list(embedding.reshape(-1))]
        self.text_storage_db.add(text_id, text, embedding)
        self.hnsw_index.add_items(embedding, [text_id])

    def upsert_overwrite_bulk(self, ids: list[int], texts: list[str], embeddings: np.ndarray):
        self.hnsw_index.add_items(embeddings, ids)
        # Convert the numpy array to lists of floats for the text store
        embeddings = [list(embedding) for embedding in embeddings]
        self.text_storage_db.add_multiple(ids, texts, embeddings)

    def search(self, embedding, k=1):
        if isinstance(embedding, list):
            embedding = np.array(embedding)
        ids, distances = self.hnsw_index.knn_query(embedding, k=k)
        # Convert the returned ids to a flat list of ints
        ids = [int(id_) for id_ in ids.flatten().tolist()]
        return [self.text_storage_db.get(id_) for id_ in ids]

    def clear(self):
        self.hnsw_index = None
        self.text_storage_db.remove_db()

    def hnsw_index_filepath(self):
        return os.path.join(self.persist_dir, "index.ghostdb")

    def hnsw_index_filepath_exists(self):
        return os.path.exists(self.hnsw_index_filepath())

    def text_storage_filepath(self):
        return os.path.join(self.persist_dir, "text_storage.db")

    def persist(self):
        self.hnsw_index.save_to_file(self.hnsw_index_filepath())

    def load(self):
        self.hnsw_index.load_from_file(self.hnsw_index_filepath())
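

# Illustrative sketch, not part of the library API: a bulk insert followed by a
# search. The ids, texts, and "demo_dir" directory are made-up example values.
if __name__ == "__main__":
    store = GhostStorage(dim=3, max_elements=100, persist_dir="demo_dir")
    store.upsert_overwrite_bulk(
        [1, 2],
        ["first text", "second text"],
        np.float32(np.random.random((2, 3))),
    )
    # Each result is an (id, text, embedding) tuple
    for id_, text, _ in store.search(np.float32([0.5, 0.5, 0.5]), k=2):
        print(id_, text)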
--------------------------------------------------------------------------------
/pyghostdb/hnswlib_index.py:
--------------------------------------------------------------------------------
import hnswlib
import numpy as np


class HNSWIndex:
    def __init__(self, dim, max_elements, ef=200, M=16):
        """
        :param dim: embedding dimension
        :param max_elements: maximum number of elements in the index
        :param ef: the size of the dynamic list of nearest neighbors used during search. A higher ef leads to more accurate but slower search. ef cannot be set lower than the number of queried nearest neighbors k; the value of ef can be anything between k and the size of the dataset.
        :param M: the number of bi-directional links created for every new element during construction. A reasonable range for M is 2-100. A higher M works better on datasets with high intrinsic dimensionality and/or when high recall is required, while a lower M works better for datasets with low intrinsic dimensionality and/or lower recall requirements. The parameter also determines the algorithm's memory consumption, which is roughly M * 8-10 bytes per stored element.
        """
        self.dim = dim
        self.max_elements = max_elements
        self.ef = ef
        self.M = M
        self.index = None

    def init_index(self):
        """
        Initialize an empty hnswlib index
        """
        self.index = hnswlib.Index(space='cosine', dim=self.dim)
        self.index.init_index(max_elements=self.max_elements, ef_construction=self.ef, M=self.M)
        self.index.set_ef(self.ef)  # query-time ef should always be > k

    def build_index(self, embedding_vectors):
        """
        Build an hnswlib index from a set of embedding vectors
        """
        assert embedding_vectors.shape[1] == self.dim

        num_elements, dim = embedding_vectors.shape
        self.index = hnswlib.Index(space='cosine', dim=dim)  # possible spaces are l2, cosine or ip
        self.index.init_index(max_elements=num_elements, ef_construction=self.ef, M=self.M)
        self.index.add_items(embedding_vectors)
        self.index.set_ef(self.ef)  # ef should always be > k

    def add_items(self, embedding_vectors, ids=None):
        """
        Add new embedding vectors to the index
        """
        self.index.add_items(embedding_vectors, ids=ids)

    def knn_query(self, embedding_vectors, k=1):
        """
        Query the index for the k nearest neighbors of the given embedding vectors
        """
        # Promote a single 1D vector to a 2D batch of one
        if len(embedding_vectors.shape) == 1:
            embedding_vectors = np.expand_dims(embedding_vectors, axis=0)
        if embedding_vectors.shape[1] != self.dim:
            raise ValueError("embedding_vectors must be of shape (num_vectors, {})".format(self.dim))
        if k > self.ef:
            raise ValueError("k must be less than or equal to ef")

        labels, distances = self.index.knn_query(embedding_vectors, k=k)
        return labels, distances

    def save_to_file(self, path):
        """
        Save the index to disk
        """
        self.index.save_index(path)

    def load_from_file(self, path):
        """
        Load the index from disk
        """
        self.index.load_index(path)
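

# Illustrative sketch, not part of the library API: how ef and M trade speed
# for recall. All numbers below are made-up example values.
#
# Rough memory estimate from the docstring above, for dim=1536 float32 vectors
# with M=16: vector data is 1536 * 4 = 6144 bytes per element, plus roughly
# 16 * 8-10 = 128-160 bytes of graph links per element.
if __name__ == "__main__":
    data = np.float32(np.random.random((1000, 64)))

    # Low ef and M: faster queries and less memory, potentially lower recall
    fast_index = HNSWIndex(dim=64, max_elements=1000, ef=10, M=8)
    fast_index.build_index(data)

    # High ef and M: slower queries and more memory, higher recall
    accurate_index = HNSWIndex(dim=64, max_elements=1000, ef=200, M=32)
    accurate_index.build_index(data)

    query = np.float32(np.random.random(64))
    print(fast_index.knn_query(query, k=5))
    print(accurate_index.knn_query(query, k=5))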
--------------------------------------------------------------------------------
/pyghostdb/parquet_conversion.py:
--------------------------------------------------------------------------------
import duckdb
import pyarrow as pa
import pyarrow.parquet


def write_to_parquet(text_ids: list[int], texts: list[str], embeddings: list[list[float]], output_file: str):
    schema = pa.schema([
        ('id_', pa.int32()),
        ('text', pa.string()),
        ('embedding', pa.list_(pa.float32())),
    ])
    data = {
        'id_': text_ids,
        'text': texts,
        'embedding': embeddings,
    }
    # Create Arrow arrays from the dictionary
    arrays = [pa.array(data[column], type=field_type) for column, field_type in zip(data.keys(), schema.types)]

    # Create an Arrow RecordBatch
    record_batch = pa.RecordBatch.from_arrays(arrays, schema=schema)

    # Convert the RecordBatch to an Arrow Table
    table = pa.Table.from_batches([record_batch])

    # Save the Table as a Parquet file
    pa.parquet.write_table(table, output_file)


def from_parquet_to_duckdb(parquet_file: str, duckdb_file: str, table_name: str):
    # Connect to DuckDB (this will create a new file if it doesn't exist)
    conn = duckdb.connect(duckdb_file)

    conn.execute(f''' DROP TABLE IF EXISTS {table_name} ''')
    conn.close()

    conn = duckdb.connect(duckdb_file)
    # Create a table with the appropriate schema
    conn.execute(f'''
    CREATE TABLE IF NOT EXISTS {table_name} (id_ INTEGER PRIMARY KEY, text TEXT, embedding DOUBLE[]);
    ''')

    # Import the Parquet file into the table
    conn.execute(f'''
    COPY {table_name} FROM '{parquet_file}' (FORMAT 'parquet');
    ''')

    # Close the connection
    conn.close()


if __name__ == "__main__":
    # Sample (id_, text, embedding) rows
    data = {
        'text': ['sample text 1', 'sample text 2'],
        'embedding': [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],
        'id_': [1, 2]
    }

    write_to_parquet(data['id_'], data['text'], data['embedding'], "test.parquet")

    from_parquet_to_duckdb("test.parquet", "my_database.duckdb", "my_table3355")
    # Print the DuckDB table
    conn = duckdb.connect("my_database.duckdb")
    print(conn.execute("SELECT * FROM my_table3355").fetchall())
    conn.close()
--------------------------------------------------------------------------------
/pyghostdb/test_hnsw.py:
--------------------------------------------------------------------------------
import numpy as np

from pyghostdb.ghost_storage import GhostStorage
from pyghostdb.hnswlib_index import HNSWIndex
from pyghostdb.text_storage import TextStorage


def test_hnsw():
    n_elements = 100
    index = HNSWIndex(dim=1024, max_elements=n_elements)
    index.build_index(np.float32(np.random.random((n_elements, 1024))))
    test_data = np.float32(np.random.random((100, 1024)))
    labels1, distances = index.knn_query(test_data, k=1)


def test_small_vectors():
    n_elements = 100
    index = HNSWIndex(dim=3, max_elements=n_elements)
    index.build_index(np.float32(np.random.random((n_elements, 3))))
    test_data = np.float32(np.random.random((100, 3)))
    labels1, distances = index.knn_query(test_data, k=1)
    print(distances)


def test_small_vectors2():
    n_elements = 100
    index = HNSWIndex(dim=3, max_elements=n_elements)
    index.init_index()
    index.add_items(np.float32(np.random.random((n_elements, 3))), list(range(n_elements)))
    test_data = np.float32(np.random.random((5, 3)))
    labels1, distances = index.knn_query(test_data, k=3)
    print(labels1)


def test_text_storage():
    embedding = [1.23, 4.56, 7.89]
    text = "hello world"
    id_ = 123
    text_storage = TextStorage(db_path="trademark.db")
    text_storage.add(id_=id_, text=text, embedding=embedding)
    id_, text, vector_data = text_storage.load(id_=id_)
    text_storage.remove_db()
    print(id_, text, vector_data)


def test_ghostdb():
    gs = GhostStorage(dim=3, max_elements=100)
    gs.upsert(1, 'hello world', np.random.random((3, 1)))
    gs.upsert(2, 'hello world 2', np.random.random((3, 1)))
    gs.upsert(3, 'hello world 3', np.random.random((3, 1)))
    gs.upsert(4, 'hello world 4', np.random.random((3, 1)))
    gs.upsert(5, 'hello world 5', np.random.random((3, 1)))
    # Re-using id 5 exercises the overwrite path of upsert
    gs.upsert(5, 'hello world 6', np.random.random((3, 1)))

    print(gs.search(np.array([1, 2, 3]), k=1))


def test_add_multiple():
    gs = GhostStorage(dim=3, max_elements=100)
    gs.upsert_overwrite_bulk(
        [1, 2, 3, 4, 5, 6],
        ['hello world', 'hello world 2', 'hello world 3',
         'hello world 4', 'hello world 5', 'hello world 6'],
        np.random.random((6, 3)))
    print(gs.search(np.array([1, 2, 3]), k=1))


if __name__ == '__main__':
    test_hnsw()
    test_small_vectors2()
    test_text_storage()
    test_ghostdb()
    test_add_multiple()
--------------------------------------------------------------------------------
/pyghostdb/text_storage.py:
--------------------------------------------------------------------------------
import os

import duckdb

from pyghostdb.parquet_conversion import write_to_parquet, from_parquet_to_duckdb


class TextStorage:
    def __init__(self, db_path: str):
        self.db_path = db_path

    def add(self, id_: int, text: str, embedding: list[float]):
        connection = duckdb.connect(self.db_path)
        cursor = connection.cursor()

        # Create a table with an ARRAY column to store the vector
        cursor.execute("CREATE TABLE IF NOT EXISTS vectors (id_ INTEGER PRIMARY KEY, text TEXT, embedding DOUBLE[])")

        # Remove the old row if it exists so the insert acts as an upsert
        cursor.execute("DELETE FROM vectors WHERE id_ = ?", (id_,))
        # Insert the vector into the table
        cursor.execute("""
        INSERT INTO vectors (id_, text, embedding)
        VALUES (?, ?, ?)
        """, (id_, text, embedding))

        connection.commit()
        connection.close()

    def add_multiple(self, ids: list[int], texts: list[str], embeddings: list[list[float]]):
        # Write the batch to a temporary Parquet file, then bulk-load it into DuckDB
        parquet_filepath = f"{self.db_path}_tmp.parquet"
        write_to_parquet(ids, texts, embeddings, parquet_filepath)
        from_parquet_to_duckdb(parquet_filepath, self.db_path, "vectors")
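
    # Design note: add_multiple stages the batch in a Parquet file and loads it
    # with DuckDB's COPY rather than issuing row-by-row INSERTs, since a single
    # columnar COPY is much faster for large batches. Note that the underlying
    # from_parquet_to_duckdb call drops and recreates the "vectors" table, so a
    # bulk load replaces existing rows rather than appending to them.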
    def get(self, id_):
        connection = duckdb.connect(self.db_path)
        cursor = connection.cursor()

        id_int = int(id_)
        # Load the full row from the table
        cursor.execute("SELECT id_, text, embedding FROM vectors WHERE id_ = ?", (id_int,))
        id_, text, vector_data = cursor.fetchone()

        return id_, text, vector_data

    def get_text(self, id_):
        connection = duckdb.connect(self.db_path)
        cursor = connection.cursor()

        id_int = int(id_)
        # Load only the text from the table
        cursor.execute("SELECT id_, text FROM vectors WHERE id_ = ?", (id_int,))
        id_, text = cursor.fetchone()

        return text

    def load(self, id_: int):
        connection = duckdb.connect(self.db_path)
        cursor = connection.cursor()

        # Load the full row from the table
        cursor.execute("SELECT id_, text, embedding FROM vectors WHERE id_ = ?", (id_,))
        id_, text, vector_data = cursor.fetchone()

        return id_, text, vector_data

    def remove_db(self):
        os.remove(self.db_path)
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
requires = ["setuptools", "wheel"]

[project]
name = "pyghostdb"
version = "0.0.5"
description = "GhostDB is a fast, lightweight embedding similarity search database."
authors = [{name="Fedor Shabashev", email="fedor.shabashev@gmail.com"}]

dependencies = [
    "numpy >=1.21.0",
    "hnswlib >=0.5.2",
    "duckdb >=0.2.9",
    "pyarrow>=5.0.0",
]

[project.urls]
GitHub = "https://github.com/ghost-db/ghostdb"
--------------------------------------------------------------------------------