├── .gitignore ├── LICENSE ├── README.md ├── documents.py ├── meme.jpeg ├── requirements.txt └── vectorstore.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Maharshi Pandya 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # smolvecstore 2 | 3 | a tiny vectorstore implementation built with numpy. 4 | that's it. 5 | 6 | numpy is all you need? 
7 | 8 | ![np.array is the best](meme.jpeg) 9 | 10 | 11 | ### motivation 12 | 13 | tired of hearing about all the available vectorstore libraries along with buzzwords thrown around, here is a tiny implementation of a "Vectorstore" built with `numpy` and `sentence-transformers` in python. 14 | 15 | this implementation is only ~100 lines of python code but still works fast enough (on cpu). 16 | 17 | lol, lmao even. 18 | 19 | 20 | ### example 21 | 22 | this is what the code using this Vectorstore looks like. 23 | 24 | **code**: 25 | 26 | ```python 27 | docs = [ 28 | "Super mario is a nice video game.", 29 | "The USA election are on the way!", 30 | "A video game is fun to play with friends.", 31 | "What if the earth was covered with plasma instead of water?" 32 | ] 33 | 34 | vs = Vectorstore.from_docs(docs, embedder=model) 35 | 36 | query = "which is a nice game you can think of?" 37 | (similar_docs, scores), search_ms = vs.search(query, k=2) 38 | ``` 39 | 40 | **output**: 41 | 42 | ``` 43 | Most similar documents: ['A video game is fun to play with friends.', 'Super mario is a nice video game.'] 44 | 45 | Scores w.r.t query (lower is better): [14.200933, 15.170744] 46 | ``` -------------------------------------------------------------------------------- /documents.py: -------------------------------------------------------------------------------- 1 | document = """She found the old lockbox buried under the floorboards, its metal surface tarnished with age. With trembling hands, she twisted the rusty latch and gasped at its contents - a stack of letters tied with faded ribbon, each bearing her grandmother's elegant script from decades past. 2 | The rain drummed an insistent melody on the rooftop as Tim stared out the window. He longed for adventure, for escape from this monotonous life. A flash of movement caught his eye - a folded paper boat drifting down the street, bobbing over puddles with reckless abandon. 
3 | In the stillness of the autumn forest, a single crimson leaf drifted lazily downward, twirling and dancing through gilded shafts of sunlight. Lila watched it in silent reverie until it finally came to rest atop her upturned palm - a transient beauty now preserved in memory. 4 | The circus had left town weeks ago, but Timmy swore he could still hear the faint calling of the ringmaster's voice. He followed it into an overgrown field where the tattered remains of a striped tent lay haunting the weeds, casting serpentine shadows in the moonlight. 5 | Emma pressed her palm against the chilled windowpane, mesmerized by the fragile, lacy patterns that the frost had etched across the glass. She yearned to reach through, to trace those intricate whorls and skim her fingertips through the frozen fractals glittering in the winter dawn. 6 | Beneath the inky cloak of midnight, the cove's usually placid waters had turned tumultuous and thick with brutish undercurrents. Yet the old fisherman calmly launched his skiff to wrestle with the gaping maw of the tempest, as if summoned by some primordial calling of the sea. 7 | The attic was a forgotten sanctuary where sunbeams danced through clouds of golden dust. Among the clutter, Alice discovered an intricate jewel-encrusted dagger - surely just an antique stage prop. But when she grasped its ornate hilt, her mind was gripped by visions of torchlit battles. 8 | No one could quite remember when the old railway handcar had been abandoned beside the rusting tracks. But every full moon, the neighborhood kids gathered to take turns pumping the stubborn lever, half-hoping the stubborn thing might finally roll to life with heavy metallic creaks. 9 | Though Jim's body had failed him long ago, he blessed the twilight years that had sharpened his mind's eye. From his worn rocking chair, he constantly reimagined the dance of the clouds into fantastic beasts and sailing ships bound for lands unseen by weathered mariners. 
"""A tiny in-memory vectorstore built on top of numpy.

The store embeds a list of documents once and answers top-k similarity
queries by brute force over the resulting embedding matrix.
"""
import time
from enum import Enum
from functools import wraps
from typing import TYPE_CHECKING, List, Optional, Tuple

import numpy as np

if TYPE_CHECKING:
    # Only needed for annotations: the store itself just calls
    # ``embedder.encode(...)``, so the heavy dependency is not imported
    # when this module is used as a library.
    from sentence_transformers import SentenceTransformer


class SimMetric(Enum):
    """Metrics that can be used to rank documents against a query."""

    EUCLIDEAN = 0  # distance: lower score means more similar
    COSINE = 1     # similarity: higher score means more similar


def timeit(func):
    """Decorate *func* so that it returns ``(result, elapsed_ms)``.

    The wrapped callable returns a 2-tuple: the original return value and
    the wall-clock time of the call in milliseconds.
    """
    @wraps(func)
    def inner(*args, **kwargs):
        # perf_counter is monotonic and high resolution, unlike time.time().
        t_start = time.perf_counter()
        res = func(*args, **kwargs)
        t_exec = time.perf_counter() - t_start
        return res, t_exec * 1000
    return inner


class Vectorstore:
    """A lightweight vectorstore built with numpy.

    Documents are kept in ``self.docs`` (a 1-D numpy array of strings) and
    their embeddings in ``self._store`` (one row per document) once
    :meth:`build_store` has been called.
    """

    def __init__(
        self,
        docs: List[str],
        embedder: Optional["SentenceTransformer"] = None,
        similarity_metric: SimMetric = SimMetric.EUCLIDEAN
    ) -> None:
        """Create an empty store for *docs*.

        Args:
            docs: the documents to index.
            embedder: object exposing ``encode(texts)`` that returns the
                embeddings as a numpy array; required for building/searching.
            similarity_metric: metric used to rank documents.

        Raises:
            NotImplementedError: if *similarity_metric* is not supported.
        """
        self.docs = np.array(docs)
        self.embedder = embedder

        # main store: (n_docs, dim) embedding matrix, filled by build_store()
        self._store: Optional[np.ndarray] = None
        self.similarity_metric = similarity_metric
        self._set_sim_func()

    def set_metric(self, metric: SimMetric):
        """Switch the metric used by subsequent searches."""
        assert isinstance(metric, SimMetric), f"Expected a SimMetric, got {type(metric).__name__}"
        self.similarity_metric = metric
        # _set_sim_func() assigns self._sim_func itself and returns None, so
        # its return value must not be assigned back to self._sim_func.
        self._set_sim_func()

    def _set_sim_func(self):
        """Bind ``self._sim_func`` to the scorer matching the current metric.

        Raises:
            NotImplementedError: if the current metric has no scorer.
        """
        if self.similarity_metric == SimMetric.EUCLIDEAN:
            self._sim_func = self._dist_euclidean__
        elif self.similarity_metric == SimMetric.COSINE:
            self._sim_func = self._cosine__
        else:
            raise NotImplementedError(f"Similarity function for {self.similarity_metric} is not implemented.")

    @classmethod
    def from_docs(
        cls,
        docs: List[str],
        embedder: Optional["SentenceTransformer"] = None,
        similarity_metric=SimMetric.EUCLIDEAN
    ) -> "Vectorstore":
        """Create a store for *docs* and embed them right away."""
        store = cls(docs, embedder=embedder, similarity_metric=similarity_metric)
        print(f"Using similarity metric: {similarity_metric}")
        return store.build_store()

    def build_store(self):
        """Embed the documents and build the store.

        Does nothing when no embedder was supplied. Returns ``self`` so the
        call can be chained.
        """
        if self.embedder is not None:
            self._store = np.asarray(self.embedder.encode(self.docs))

        return self

    @timeit
    def search(self, query: str, k: int = 5) -> tuple:
        """Return the top *k* documents most similar to *query*.

        Because of the ``timeit`` decorator the caller receives
        ``((docs, scores), elapsed_ms)``. For the Euclidean metric a lower
        score is better; for the cosine metric a higher score is better.
        If *k* exceeds the number of documents, all documents are returned.
        """
        assert self.embedder is not None, "An embedder is required to search."
        assert self._store is not None, "Store is empty; call build_store() first."
        assert k >= 1, "k must be at least 1."

        q_emb = np.asarray(self.embedder.encode(query))
        assert q_emb.ndim == 1

        return self._get_topk_similar(q_emb, k=k)

    def _dist_euclidean__(self, query: np.ndarray):
        """Return the Euclidean distance from *query* to every stored vector."""
        assert query.ndim == 1
        assert query.shape[0] == self._store.shape[1], f"Shape mismatch between query and store: {query.shape}, {self._store.shape}"

        # dist = sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2 + ...): the square root
        # applies to the per-row *sum* of squares. Taking it element-wise
        # before summing would yield the L1 distance instead.
        dist: np.ndarray = np.linalg.norm(self._store - query, axis=1)
        return dist

    def _cosine__(self, query: np.ndarray):
        """Return the cosine similarity between *query* and every stored vector."""
        assert query.ndim == 1
        assert query.shape[0] == self._store.shape[1], f"Shape mismatch between query and store: {query.shape}, {self._store.shape}"

        norm_a = np.linalg.norm(self._store, axis=1)
        norm_b = np.linalg.norm(query)

        # cos = a.b / (|a| * |b|); the parentheses matter, otherwise the
        # result is multiplied by |b| instead of divided by it.
        denom = norm_a * norm_b
        dots = np.dot(self._store, query)

        # Zero-length vectors have no direction: score them 0 instead of
        # producing NaN/inf, which would corrupt the ranking.
        similarity = np.zeros_like(denom)
        np.divide(dots, denom, out=similarity, where=denom != 0)
        return similarity

    def _get_topk_similar(self, query: np.ndarray, k: int = 5) -> Tuple[list, np.ndarray]:
        """Score all documents against *query* and return the best *k*.

        Returns:
            ``(docs, scores)`` ordered from best to worst match. At most
            ``len(self.docs)`` entries are returned.
        """
        scores = self._sim_func(query)
        order = np.argsort(scores)
        if self.similarity_metric == SimMetric.COSINE:
            # cosine is a similarity: the best matches have the highest score
            order = order[::-1]
        top_k_indices = order[:k]

        topk_docs = self.docs[top_k_indices]
        topk_dist = scores[top_k_indices]

        # slicing already clamps k to the number of documents
        assert topk_docs.ndim == 1 and topk_docs.shape[0] == min(k, self.docs.shape[0])
        assert topk_dist.ndim == 1

        return list(topk_docs), topk_dist

    def __repr__(self) -> str:
        return f"Vectorstore(embedder = {self.embedder})"


# --------- Usage -----------

def main() -> None:
    """Run a small end-to-end demo on the bundled sample document."""
    # Imported here so that importing this module as a library neither
    # loads the embedding model nor requires the demo data.
    from sentence_transformers import SentenceTransformer

    from documents import document

    # example embedder
    print("Loading the embedding model...")
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')

    docs = document.split('\n')
    print(f"Building vectorstore for {len(docs)} documents...")

    metric = SimMetric.COSINE
    vs = Vectorstore.from_docs(docs, embedder=model, similarity_metric=metric)

    query = "What did emma do in this story?"
    result, exectime = vs.search(query, k=2)

    # the direction of "better" depends on the metric in use
    hint = "higher is better" if metric is SimMetric.COSINE else "lower is better"
    print(f"\nMost similar documents: {result[0]}")
    print(f"Scores w.r.t query ({hint}): {list(result[1])}")
    print(f"\nSearch time: {exectime} ms")


if __name__ == "__main__":
    main()