├── .changeset ├── README.md ├── config.json ├── forty-papayas-push.md ├── nine-readers-trade.md ├── six-coins-care.md └── tall-lies-hope.md ├── .gitignore ├── .npmignore ├── .prettierrc ├── CHANGELOG.md ├── LICENSE ├── README.md ├── package-lock.json ├── package.json ├── src ├── cache.ts ├── hnsw.ts ├── index.ts ├── indexedDB.ts └── utils.ts └── tsconfig.json /.changeset/README.md: -------------------------------------------------------------------------------- 1 | # Changesets 2 | 3 | Hello and welcome! This folder has been automatically generated by `@changesets/cli`, a build tool that works 4 | with multi-package repos, or single-package repos to help you version and publish your code. You can 5 | find the full documentation for it [in our repository](https://github.com/changesets/changesets) 6 | 7 | We have a quick list of common questions to get you started engaging with this project in 8 | [our documentation](https://github.com/changesets/changesets/blob/main/docs/common-questions.md) 9 | -------------------------------------------------------------------------------- /.changeset/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://unpkg.com/@changesets/config@2.3.0/schema.json", 3 | "changelog": "@changesets/cli/changelog", 4 | "commit": false, 5 | "fixed": [], 6 | "linked": [], 7 | "access": "public", 8 | "baseBranch": "main", 9 | "updateInternalDependencies": "patch", 10 | "ignore": [] 11 | } 12 | -------------------------------------------------------------------------------- /.changeset/forty-papayas-push.md: -------------------------------------------------------------------------------- 1 | --- 2 | "client-vector-search": patch 3 | --- 4 | 5 | updates the docs and dynamic import for @xenova/transformers 6 | -------------------------------------------------------------------------------- /.changeset/nine-readers-trade.md: 
-------------------------------------------------------------------------------- 1 | --- 2 | 'client-vector-search': minor 3 | --- 4 | 5 | support for experimental hnsw that runs on node and browser with json and binary serialization options 6 | -------------------------------------------------------------------------------- /.changeset/six-coins-care.md: -------------------------------------------------------------------------------- 1 | --- 2 | "client-vector-search": patch 3 | --- 4 | 5 | creates a proper embedding index 6 | -------------------------------------------------------------------------------- /.changeset/tall-lies-hope.md: -------------------------------------------------------------------------------- 1 | --- 2 | "client-vector-search": patch 3 | --- 4 | 5 | adds in-memory index creation and brute force knn search 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | dist 3 | test.js 4 | mock/ 5 | .DS_Store 6 | .pytest_cache/ 7 | test*/ 8 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | dist 3 | test.js 4 | mock/ 5 | .DS_Store 6 | .pytest_cache/ 7 | test*/ 8 | 9 | .github/ 10 | .changeset/ 11 | -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "semi": true, 3 | "trailingComma": "all", 4 | "singleQuote": true, 5 | "printWidth": 80, 6 | "tabWidth": 2 7 | } 8 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # client-vector-search 2 | 3 | ## 0.2.0 4 | 5 | ### Minor Changes 6 | 7 | - support for
experimental hnsw that runs on node and browser with json and binary serialization options 8 | 9 | ### Patch Changes 10 | 11 | - f09bc2f: updates the docs and dynamic import for @xenova/transformers 12 | - 46e07d6: creates a proper embedding index 13 | - 13bddbb: adds in-memory index creation and brute force knn search 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Yusuf Hilmi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # client-vector-search 2 | 3 | A client side vector search library that can embed, search, and cache. Works on the browser and server side.
4 | 5 | It outperforms OpenAI's text-embedding-ada-002 and is way faster than Pinecone and other VectorDBs. 6 | 7 | I'm the founder of [searchbase.app](https://searchbase.app) and we needed this for our product and customers. We'll be using this library in production. You can be sure it'll be maintained and improved. 8 | 9 | - Embed documents using transformers by default: gte-small (~30mb). 10 | - Calculate cosine similarity between embeddings. 11 | - Create an index and search on the client side. 12 | - Cache vectors with browser caching support. 13 | 14 | Lots of improvements are coming! 15 | 16 | ## Roadmap 17 | 18 | Our goal is to build a super simple, fast vector search that works with a couple hundred to a few thousand vectors. ~1k vectors per user covers 99% of the use cases. 19 | 20 | We'll initially keep things super simple and keep searches under 100ms. 21 | 22 | ### TODOs 23 | - [ ] add HNSW index that works on node and browser env, don't rely on hnsw binder libs 24 | - [ ] add a proper testing suite and ci/cd for the lib 25 | - [ ] simple health tests 26 | - [ ] mock the @xenova/transformers for jest, it's not happy with it 27 | - [ ] performance tests, recall, memory usage, cpu usage etc. 28 | 29 | 30 | ## Installation 31 | 32 | ```bash 33 | npm i client-vector-search 34 | ``` 35 | 36 | 37 | ## Quickstart 38 | 39 | This library provides a plug-and-play solution for embedding and vector search. It's designed to be easy to use, efficient, and versatile.
Here's a quick start guide: 40 | 41 | 42 | ```ts 43 | import { getEmbedding, EmbeddingIndex } from 'client-vector-search'; 44 | 45 | // getEmbedding is an async function, so you need to use 'await' or '.then()' to get the result 46 | const embedding = await getEmbedding("Apple"); // Returns embedding as number[] 47 | 48 | // Each object should have an 'embedding' property of type number[] 49 | const initialObjects = [ 50 | { id: 1, name: "Apple", embedding: embedding }, 51 | { id: 2, name: "Banana", embedding: await getEmbedding("Banana") }, 52 | { id: 3, name: "Cheddar", embedding: await getEmbedding("Cheddar")}, 53 | { id: 4, name: "Space", embedding: await getEmbedding("Space")}, 54 | { id: 5, name: "database", embedding: await getEmbedding("database")}, 55 | ]; 56 | const index = new EmbeddingIndex(initialObjects); // Creates an index 57 | 58 | // The query should be an embedding of type number[] 59 | const queryEmbedding = await getEmbedding('Fruit'); // Query embedding 60 | const results = await index.search(queryEmbedding, { topK: 5 }); // Returns top similar objects 61 | 62 | // specify the storage type 63 | await index.saveIndex('indexedDB'); 64 | const storedResults = await index.search(queryEmbedding, { 65 | topK: 5, 66 | useStorage: 'indexedDB', 67 | // storageOptions: { // use only if you overrode the defaults 68 | // indexedDBName: 'clientVectorDB', 69 | // indexedDBObjectStoreName: 'ClientEmbeddingStore', 70 | // }, 71 | }); 72 | 73 | console.log(results, storedResults); 74 | 75 | await index.deleteIndexedDB(); // if you overrode default, specify db name 76 | ``` 77 | 78 | ## Trouble-shooting 79 | 80 | ### NextJS 81 | To use it inside NextJS projects you'll need to update the `next.config.js` file to include the following: 82 | 83 | ```js 84 | module.exports = { 85 | // Override the default webpack configuration 86 | webpack: (config) => { 87 | // See https://webpack.js.org/configuration/resolve/#resolvealias 88 | config.resolve.alias = { 89 | ...config.resolve.alias, 90 |
sharp$: false, 91 | "onnxruntime-node$": false, 92 | }; 93 | return config; 94 | }, 95 | }; 96 | ``` 97 | 98 | #### Model load after page is loaded 99 | 100 | You can initialize the model before using it to generate embeddings. This will ensure that the model is loaded before you use it and provide a better UX. 101 | 102 | ```js 103 | import { initializeModel } from "client-vector-search" 104 | ... 105 | useEffect(() => { 106 | try { 107 | initializeModel(); 108 | } catch (e) { 109 | console.log(e); 110 | } 111 | }, []); 112 | ``` 113 | 114 | ## Usage Guide 115 | 116 | This guide provides a step-by-step walkthrough of the library's main features. It covers everything from generating embeddings for a string to performing operations on the index such as adding, updating, and removing objects. It also includes instructions on how to save the index to a database and perform search operations within it. 117 | 118 | Until we have a reference documentation, you can find all the methods and their usage in this guide. Each step is accompanied by a code snippet to illustrate the usage of the method in question. Make sure to follow along and try out the examples in your own environment to get a better understanding of how everything works. 119 | 120 | Let's get started! 121 | 122 | ### Step 1: Generate Embeddings for String 123 | Generate embeddings for a given string using the `getEmbedding` method. 124 | 125 | ```ts 126 | const embedding = await getEmbedding("Apple"); // Returns embedding as number[] 127 | ``` 128 | > **Note**: `getEmbedding` is asynchronous; make sure to use `await`. 129 | 130 | --- 131 | 132 | ### Step 2: Calculate Cosine Similarity 133 | Calculate the cosine similarity between two embeddings. 134 | 135 | ```ts 136 | const similarity = cosineSimilarity(embedding1, embedding2, 6); 137 | ``` 138 | > **Note**: Both embeddings should be of the same length. 
139 | 140 | --- 141 | 142 | ### Step 3: Create an Index 143 | Create an index with an initial array of objects. Each object must have an 'embedding' property. 144 | 145 | ```ts 146 | const initialObjects = [...]; 147 | const index = new EmbeddingIndex(initialObjects); 148 | ``` 149 | 150 | --- 151 | 152 | ### Step 4: Add to Index 153 | Add an object to the index. 154 | 155 | ```ts 156 | const objectToAdd = { id: 6, name: 'Cat', embedding: await getEmbedding('Cat') }; 157 | index.add(objectToAdd); 158 | ``` 159 | 160 | --- 161 | 162 | ### Step 5: Update Index 163 | Update an existing object in the index. 164 | 165 | ```ts 166 | const vectorToUpdate = { id: 6, name: 'Dog', embedding: await getEmbedding('Dog') }; 167 | index.update({ id: 6 }, vectorToUpdate); 168 | ``` 169 | 170 | --- 171 | 172 | ### Step 6: Remove from Index 173 | Remove an object from the index. 174 | 175 | ```ts 176 | index.remove({ id: 6 }); 177 | ``` 178 | 179 | --- 180 | 181 | ### Step 7: Retrieve from Index 182 | Retrieve an object from the index. 183 | 184 | ```ts 185 | const vector = index.get({ id: 1 }); 186 | ``` 187 | 188 | --- 189 | 190 | ### Step 8: Search the Index 191 | Search the index with a query embedding. 192 | 193 | ```ts 194 | const queryEmbedding = await getEmbedding('Fruit'); 195 | const results = await index.search(queryEmbedding, { topK: 5 }); 196 | ``` 197 | 198 | --- 199 | 200 | ### Step 9: Print the Index 201 | Print the entire index to the console. 202 | 203 | ```ts 204 | index.printIndex(); 205 | ``` 206 | 207 | --- 208 | 209 | ### Step 10: Save Index to IndexedDB (for browser) 210 | Save the index to a persistent IndexedDB database. Note that this only works where `indexedDB` is available (i.e. in the browser); otherwise an error is thrown. 211 | 212 | ```ts 213 | await index.saveIndex("indexedDB", { DBName: "clientVectorDB", objectStoreName:"ClientEmbeddingStore"}) 214 | ``` 215 | 216 | --- 217 | 218 | ### Important: Search in indexedDB 219 | Perform a search operation in the IndexedDB.
220 | 221 | ```ts 222 | const results = await index.search(queryEmbedding, { 223 | topK: 5, 224 | useStorage: "indexedDB", 225 | storageOptions: { // only if you want to override the default options, defaults are below 226 | indexedDBName: 'clientVectorDB', 227 | indexedDBObjectStoreName: 'ClientEmbeddingStore' 228 | } 229 | }); 230 | ``` 231 | --- 232 | 233 | ### Delete Database 234 | To delete an entire database. 235 | 236 | ```ts 237 | await IndexedDbManager.deleteIndexedDB("clientVectorDB"); 238 | ``` 239 | 240 | --- 241 | 242 | ### Delete Object Store 243 | To delete an object store from a database. 244 | 245 | ```ts 246 | await IndexedDbManager.deleteIndexedDBObjectStore("clientVectorDB", "ClientEmbeddingStore"); 247 | ``` 248 | 249 | --- 250 | 251 | ### Retrieve All Objects 252 | To retrieve all objects from a specific object store. 253 | 254 | ```ts 255 | const allObjects = await IndexedDbManager.getAllObjectsFromIndexedDB("clientVectorDB", "ClientEmbeddingStore"); 256 | ``` 257 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "client-vector-search", 3 | "version": "0.2.0", 4 | "description": "A client side vector search library", 5 | "main": "dist/index.js", 6 | "module": "dist/index.mjs", 7 | "types": "dist/index.d.ts", 8 | "scripts": { 9 | "build": "tsup src/index.ts --format cjs,esm --dts", 10 | "dev": "tsup src/index.ts --format cjs,esm --dts --watch", 11 | "changeset": "changeset", 12 | "version": "changeset version", 13 | "release": "npm run build && changeset publish", 14 | "lint": "tsc" 15 | }, 16 | "repository": { 17 | "type": "git", 18 | "url": "git+https://github.com/yusufhilmi/client-vector-search.git" 19 | }, 20 | "keywords": [ 21 | "vector", 22 | "search", 23 | "embeddings", 24 | "nlp", 25 | "models" 26 | ], 27 | "author": "yusufhilmi", 28 | "license": "MIT", 29 | "bugs": { 30 | "url":
"https://github.com/yusufhilmi/client-vector-search/issues" 31 | }, 32 | "homepage": "https://github.com/yusufhilmi/client-vector-search#readme", 33 | "devDependencies": { 34 | "@changesets/cli": "^2.26.2", 35 | "fake-indexeddb": "^4.0.2", 36 | "tsup": "^6.5.0", 37 | "typescript": "^4.9.5" 38 | }, 39 | "dependencies": { 40 | "@msgpack/msgpack": "^3.0.0-beta2", 41 | "@xenova/transformers": "^2.5.2", 42 | "lru-cache": "^10.0.1" 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/cache.ts: -------------------------------------------------------------------------------- 1 | import { LRUCache } from 'lru-cache'; 2 | 3 | class Cache { 4 | private static instance: LRUCache; 5 | 6 | private constructor() {} 7 | 8 | public static getInstance( 9 | max: number = 10000, 10 | maxAge: number = 1000 * 60 * 10, 11 | ): LRUCache { 12 | if (!Cache.instance) { 13 | const options = { 14 | max: max, 15 | length: () => 1, 16 | maxAge: maxAge, 17 | }; 18 | Cache.instance = new LRUCache(options); 19 | } 20 | return Cache.instance; 21 | } 22 | } 23 | 24 | export default Cache; 25 | -------------------------------------------------------------------------------- /src/hnsw.ts: -------------------------------------------------------------------------------- 1 | // an experimental implementation of hnsw that doesn't rely on the hnsw binding libs which only works in browser or node 2 | // TODOS: 3 | // - bare bones 4 | // - find # layers and optimal params 5 | // - test the speed, accuracy, and memory usage 6 | import { encode, decode } from '@msgpack/msgpack'; 7 | 8 | type Vector = number[]; 9 | type Distance = number; 10 | type NodeIndex = number; 11 | type Layer = LayerNode[]; 12 | 13 | interface LayerNode { 14 | vector: Vector; 15 | connections: NodeIndex[]; 16 | layerBelow: NodeIndex | null; 17 | } 18 | 19 | interface HNSWData { 20 | L: number; 21 | mL: number; 22 | efc: number; 23 | index: Layer[]; 24 | } 25 | 26 | // Simple Priority 
Queue Implementation 27 | class PriorityQueue { 28 | private elements: T[]; 29 | private compareFn: (a: T, b: T) => number; 30 | 31 | constructor(elements: T[], compareFn: (a: T, b: T) => number) { 32 | this.elements = elements; 33 | this.compareFn = compareFn; 34 | this.elements.sort(this.compareFn); 35 | } 36 | 37 | push(element: T) { 38 | this.elements.push(element); 39 | this.elements.sort(this.compareFn); 40 | } 41 | 42 | pop(): T | null { 43 | return this.elements.shift() || null; 44 | } 45 | 46 | isEmpty(): boolean { 47 | return this.elements.length === 0; 48 | } 49 | } 50 | 51 | const EuclideanDistance = (a: Vector, b: Vector): Distance => { 52 | if (a.length !== b.length) { 53 | throw new Error('Vectors must have the same length'); 54 | } 55 | 56 | return Math.sqrt( 57 | a.reduce((acc, val, i) => { 58 | const bVal = b[i]; // Check b[i] in a variable 59 | if (bVal === undefined) throw new Error('b[i] is undefined'); 60 | return acc + Math.pow(val - bVal, 2); 61 | }, 0), 62 | ); 63 | }; 64 | 65 | const getInsertLayer = (L: number, mL: number): number => { 66 | return Math.min(-Math.floor(Math.log(Math.random()) * mL), L - 1); 67 | }; 68 | const _searchLayer = ( 69 | graph: Layer, 70 | entry: NodeIndex, 71 | query: Vector, 72 | ef: number, 73 | ): [Distance, NodeIndex][] => { 74 | if (entry < 0 || entry >= graph.length) { 75 | throw new Error(`Invalid entry index: ${entry}`); 76 | } 77 | 78 | // Check if the graph at the entry index is defined 79 | const graphEntry = graph[entry]; 80 | if (!graphEntry) { 81 | throw new Error(`Graph entry at index ${entry} is undefined`); 82 | } 83 | 84 | const best: [Distance, NodeIndex] = [ 85 | EuclideanDistance(graphEntry.vector, query), 86 | entry, 87 | ]; 88 | const nns: [Distance, NodeIndex][] = [best]; 89 | const visited = new Set([best[1]]); 90 | const candidates = new PriorityQueue<[Distance, NodeIndex]>( 91 | [best], 92 | (a, b) => a[0] - b[0], 93 | ); 94 | 95 | while (!candidates.isEmpty()) { 96 | const current = 
candidates.pop(); 97 | // Define a variable to hold the last element of nns array 98 | const lastNnsElement = nns.length > 0 ? nns[nns.length - 1] : null; 99 | // Check if current is not null and lastNnsElement is not undefined before comparing their values 100 | if (!current || (lastNnsElement && lastNnsElement[0] < current[0])) break; 101 | 102 | const graphCurrent = graph[current[1]]; 103 | if (!graphCurrent) continue; 104 | 105 | for (const e of graphCurrent.connections) { 106 | const graphE = graph[e]; 107 | if (!graphE) continue; 108 | 109 | const dist = EuclideanDistance(graphE.vector, query); 110 | if (!visited.has(e)) { 111 | visited.add(e); 112 | const lastNn = nns[nns.length - 1]; 113 | if (!lastNn || dist < lastNn[0] || nns.length < ef) { 114 | candidates.push([dist, e]); 115 | nns.push([dist, e]); 116 | nns.sort((a, b) => a[0] - b[0]); 117 | if (nns.length > ef) { 118 | nns.pop(); 119 | } 120 | } 121 | } 122 | } 123 | } 124 | 125 | return nns; 126 | }; 127 | export class ExperimentalHNSWIndex { 128 | private L: number; 129 | private mL: number; 130 | private efc: number; 131 | private index: Layer[]; 132 | 133 | constructor(L = 5, mL = 0.62, efc = 10) { 134 | this.L = L; 135 | this.mL = mL; 136 | this.efc = efc; 137 | this.index = Array.from({ length: L }, () => []); 138 | } 139 | setIndex(index: Layer[]): void { 140 | this.index = index; 141 | } 142 | 143 | insert(vec: Vector) { 144 | const l = getInsertLayer(this.L, this.mL); 145 | let startV = 0; 146 | 147 | for (let n = 0; n < this.L; n++) { 148 | const graph = this.index[n]; 149 | 150 | if (graph?.length === 0) { 151 | // If the graph layer is empty, add a new node to it 152 | // Assign next layer to a variable and check if it's undefined 153 | const nextLayer = this.index[n + 1]; 154 | const nextLayerLength = nextLayer ? nextLayer.length : null; 155 | graph?.push({ 156 | vector: vec, 157 | connections: [], 158 | layerBelow: n < this.L - 1 ? 
nextLayerLength : null, 159 | }); 160 | continue; 161 | } 162 | 163 | if (n < l && graph) { 164 | // Check if the search layer result is not undefined before accessing its properties 165 | const searchLayerResult = _searchLayer(graph, startV, vec, 1); 166 | startV = 167 | searchLayerResult && searchLayerResult[0] 168 | ? searchLayerResult[0][1] 169 | : startV; 170 | } else if (graph) { 171 | // Assign next layer to a variable and check if it's undefined 172 | const nextLayer = this.index[n + 1]; 173 | const nextLayerLength = nextLayer ? nextLayer.length : null; 174 | const node: LayerNode = { 175 | vector: vec, 176 | connections: [], 177 | layerBelow: n < this.L - 1 ? nextLayerLength : null, 178 | }; 179 | const nns = _searchLayer(graph, startV, vec, this.efc); 180 | for (const nn of nns) { 181 | node.connections.push(nn[1]); 182 | graph[nn[1]]?.connections.push(graph.length); 183 | } 184 | graph?.push(node); 185 | // Assign graph[startV] to a variable and check if it's undefined before accessing its properties 186 | const graphStartV = graph[startV]; 187 | if (graphStartV) startV = graphStartV.layerBelow!; 188 | } 189 | } 190 | } 191 | 192 | search(query: Vector, ef = 1): [Distance, NodeIndex][] { 193 | if (this.index && this.index[0] && this.index[0].length === 0) { 194 | return []; 195 | } 196 | 197 | let bestV = 0; 198 | for (const graph of this.index) { 199 | const searchLayer = _searchLayer(graph, bestV, query, ef); 200 | if (searchLayer && searchLayer[0]) { 201 | bestV = searchLayer[0][1]; 202 | if (graph[bestV]?.layerBelow === null) { 203 | return _searchLayer(graph, bestV, query, ef); 204 | } 205 | bestV = graph[bestV]?.layerBelow!; 206 | } 207 | } 208 | return []; 209 | } 210 | 211 | toJSON() { 212 | return { 213 | L: this.L, 214 | mL: this.mL, 215 | efc: this.efc, 216 | index: this.index, 217 | }; 218 | } 219 | 220 | static fromJSON(json: any): ExperimentalHNSWIndex { 221 | const hnsw = new ExperimentalHNSWIndex(json.L, json.mL, json.efc); 222 | return 
hnsw; 223 | } 224 | 225 | toBinary() { 226 | return encode({ 227 | L: this.L, 228 | mL: this.mL, 229 | efc: this.efc, 230 | index: this.index, 231 | }); 232 | } 233 | 234 | static fromBinary(binary: Uint8Array): ExperimentalHNSWIndex { 235 | const data = decode(binary) as HNSWData; 236 | const hnsw = new ExperimentalHNSWIndex(data.L, data.mL, data.efc); 237 | hnsw.setIndex(data.index); 238 | return hnsw; 239 | } 240 | } 241 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | const DEFAULT_TOP_K = 3; 2 | 3 | interface Filter { 4 | [key: string]: any; 5 | } 6 | 7 | import Cache from './cache'; 8 | import { IndexedDbManager } from './indexedDB'; 9 | import { cosineSimilarity } from './utils'; 10 | export { ExperimentalHNSWIndex } from './hnsw'; 11 | 12 | // uncomment if you want to test indexedDB implementation in node env for faster dev cycle 13 | // import { IDBFactory } from 'fake-indexeddb'; 14 | // const indexedDB = new IDBFactory(); 15 | 16 | export interface SearchResult { 17 | similarity: number; 18 | object: any; 19 | } 20 | 21 | type StorageOptions = 'indexedDB' | 'localStorage' | 'none'; 22 | 23 | /** 24 | * Interface for search options in the EmbeddingIndex class. 25 | * topK: The number of top similar items to return. 26 | * filter: An optional filter to apply to the objects before searching. 27 | * useStorage: A flag to indicate whether to use storage options like indexedDB or localStorage. 
28 | */ 29 | interface SearchOptions { 30 | topK?: number; 31 | filter?: Filter; 32 | useStorage?: StorageOptions; 33 | storageOptions?: { indexedDBName: string; indexedDBObjectStoreName: string }; // TODO: generalize it to localStorage as well 34 | } 35 | 36 | const cacheInstance = Cache.getInstance(); 37 | 38 | let pipe: any; 39 | let currentModel: string; 40 | 41 | export const initializeModel = async ( 42 | model: string = 'Xenova/gte-small', 43 | ): Promise => { 44 | if (model !== currentModel) { 45 | const transformersModule = await import('@xenova/transformers'); 46 | const pipeline = transformersModule.pipeline; 47 | pipe = await pipeline('feature-extraction', model); 48 | currentModel = model; 49 | } 50 | }; 51 | 52 | export const getEmbedding = async ( 53 | text: string, 54 | precision: number = 7, 55 | options = { pooling: 'mean', normalize: false }, 56 | model = 'Xenova/gte-small', 57 | ): Promise => { 58 | const cachedEmbedding = cacheInstance.get(text); 59 | if (cachedEmbedding) { 60 | return Promise.resolve(cachedEmbedding); 61 | } 62 | 63 | if (model !== currentModel) { 64 | await initializeModel(model); 65 | } 66 | 67 | const output = await pipe(text, options); 68 | const roundedOutput = Array.from(output.data as number[]).map( 69 | (value: number) => parseFloat(value.toFixed(precision)), 70 | ); 71 | cacheInstance.set(text, roundedOutput); 72 | return Array.from(roundedOutput); 73 | }; 74 | 75 | export class EmbeddingIndex { 76 | private objects: Filter[]; 77 | private keys: string[]; 78 | 79 | constructor(initialObjects?: Filter[]) { 80 | // TODO: add support for options while creating index such as {... 
indexedDB: true, ...} 81 | this.objects = []; 82 | this.keys = []; 83 | if (initialObjects && initialObjects.length > 0) { 84 | initialObjects.forEach((obj) => this.validateAndAdd(obj)); 85 | if (initialObjects[0]) { 86 | this.keys = Object.keys(initialObjects[0]); 87 | } 88 | } 89 | } 90 | 91 | private findVectorIndex(filter: Filter): number { 92 | return this.objects.findIndex((object) => 93 | Object.keys(filter).every((key) => object[key] === filter[key]), 94 | ); 95 | } 96 | 97 | private validateAndAdd(obj: Filter) { 98 | if (!Array.isArray(obj.embedding) || obj.embedding.some(isNaN)) { 99 | throw new Error( 100 | 'Object must have an embedding property of type number[]', 101 | ); 102 | } 103 | if (this.keys.length === 0) { 104 | this.keys = Object.keys(obj); 105 | } else if (!this.keys.every((key) => key in obj)) { 106 | throw new Error( 107 | 'Object must have the same properties as the initial objects', 108 | ); 109 | } 110 | this.objects.push(obj); 111 | } 112 | 113 | add(obj: Filter) { 114 | this.validateAndAdd(obj); 115 | } 116 | 117 | // Method to update an existing vector in the index 118 | update(filter: Filter, vector: Filter) { 119 | const index = this.findVectorIndex(filter); 120 | if (index === -1) { 121 | throw new Error('Vector not found'); 122 | } 123 | if (vector.hasOwnProperty('embedding')) { 124 | // Validate and add the new vector 125 | this.validateAndAdd(vector); 126 | } 127 | // Replace the old vector with the new one 128 | this.objects[index] = Object.assign(this.objects[index] as Filter, vector); 129 | } 130 | 131 | // Method to remove a vector from the index 132 | remove(filter: Filter) { 133 | const index = this.findVectorIndex(filter); 134 | if (index === -1) { 135 | throw new Error('Vector not found'); 136 | } 137 | // Remove the vector from the index 138 | this.objects.splice(index, 1); 139 | } 140 | 141 | // Method to remove multiple vectors from the index 142 | removeBatch(filters: Filter[]) { 143 | filters.forEach((filter) => { 
144 | const index = this.findVectorIndex(filter); 145 | if (index !== -1) { 146 | // Remove the vector from the index 147 | this.objects.splice(index, 1); 148 | } 149 | }); 150 | } 151 | 152 | // Method to retrieve a vector from the index 153 | get(filter: Filter) { 154 | const vector = this.objects[this.findVectorIndex(filter)]; 155 | return vector || null; 156 | } 157 | 158 | size(): number { 159 | // Returns the size of the index 160 | return this.objects.length; 161 | } 162 | 163 | clear() { 164 | this.objects = []; 165 | } 166 | 167 | async search( 168 | queryEmbedding: number[], 169 | options: SearchOptions = { 170 | topK: 3, 171 | useStorage: 'none', 172 | storageOptions: { 173 | indexedDBName: 'clientVectorDB', 174 | indexedDBObjectStoreName: 'ClientEmbeddingStore', 175 | }, 176 | }, 177 | ): Promise { 178 | const topK = options.topK || DEFAULT_TOP_K; 179 | const filter = options.filter || {}; 180 | const useStorage = options.useStorage || 'none'; 181 | 182 | if (useStorage === 'indexedDB') { 183 | const DBname = options.storageOptions?.indexedDBName || 'clientVectorDB'; 184 | const objectStoreName = 185 | options.storageOptions?.indexedDBObjectStoreName || 186 | 'ClientEmbeddingStore'; 187 | 188 | if (typeof indexedDB === 'undefined') { 189 | console.error('IndexedDB is not supported'); 190 | throw new Error('IndexedDB is not supported'); 191 | } 192 | const results = await this.loadAndSearchFromIndexedDB( 193 | DBname, 194 | objectStoreName, 195 | queryEmbedding, 196 | topK, 197 | filter, 198 | ); 199 | return results; 200 | } else { 201 | // Compute similarities 202 | const similarities = this.objects 203 | .filter((object) => 204 | Object.keys(filter).every((key) => object[key] === filter[key]), 205 | ) 206 | .map((obj) => ({ 207 | similarity: cosineSimilarity(queryEmbedding, obj.embedding), 208 | object: obj, 209 | })); 210 | 211 | // Sort by similarity and return topK results 212 | return similarities 213 | .sort((a, b) => b.similarity - a.similarity) 
214 | .slice(0, topK); 215 | } 216 | }

  /**
   * Logs every object held in `this.objects` to the console, numbering the
   * items from 1.
   */
  printIndex() {
    console.log('Index Content:');
    this.objects.forEach((obj, idx) => {
      console.log(`Item ${idx + 1}:`, obj);
    });
  }

  /**
   * Persists the in-memory index to a storage backend.
   *
   * @param storageType - Backend to use; only `'indexedDB'` is handled.
   * @param options - Database and object store names to write to.
   * @throws Error if `storageType` is anything other than `'indexedDB'`, or
   *   whatever `saveToIndexedDB` throws.
   */
  async saveIndex(
    storageType: string,
    options: { DBName: string; objectStoreName: string } = {
      DBName: 'clientVectorDB',
      objectStoreName: 'ClientEmbeddingStore',
    },
  ) {
    if (storageType === 'indexedDB') {
      await this.saveToIndexedDB(options.DBName, options.objectStoreName);
    } else {
      throw new Error(
        `Unsupported storage type: ${storageType} \n Supported storage types: "indexedDB"`,
      );
    }
  }

  /**
   * Writes every object in `this.objects` to an IndexedDB object store.
   *
   * @param DBname - Name of the database to write to.
   * @param objectStoreName - Name of the object store to write to.
   * @throws Error if the `indexedDB` global is missing, if the index is
   *   empty, or if opening the database / adding the objects fails (the
   *   underlying error is logged before the generic error is thrown).
   */
  async saveToIndexedDB(
    DBname: string = 'clientVectorDB',
    objectStoreName: string = 'ClientEmbeddingStore',
  ): Promise<void> {
    if (typeof indexedDB === 'undefined') {
      console.error('IndexedDB is not defined');
      throw new Error('IndexedDB is not supported');
    }

    if (!this.objects || this.objects.length === 0) {
      throw new Error('Index is empty. Nothing to save');
    }

    try {
      const db = await IndexedDbManager.create(DBname, objectStoreName);
      await db.addToIndexedDB(this.objects);
      console.log(
        `Index saved to database '${DBname}' object store '${objectStoreName}'`,
      );
    } catch (error) {
      console.error('Error saving index to database:', error);
      throw new Error('Error saving index to database');
    }
  }

  /**
   * Streams the records of an IndexedDB object store, keeps those whose
   * properties strictly equal every entry of `filter`, scores each kept
   * record's `embedding` against `queryEmbedding` with `cosineSimilarity`,
   * and returns the `topK` best matches in descending similarity order.
   *
   * @param DBname - Name of the database to read from.
   * @param objectStoreName - Name of the object store to read from.
   * @param queryEmbedding - Query vector.
   * @param topK - Maximum number of results to return.
   * @param filter - Key/value pairs a record must match exactly (`===`);
   *   an empty object matches every record.
   * @returns Up to `topK` `{ similarity, object }` pairs, best first.
   */
  async loadAndSearchFromIndexedDB(
    DBname: string = 'clientVectorDB',
    objectStoreName: string = 'ClientEmbeddingStore',
    queryEmbedding: number[],
    topK: number,
    filter: { [key: string]: any },
  ): Promise<{ similarity: number; object: any }[]> {
    const db = await IndexedDbManager.create(DBname, objectStoreName);
    const generator = db.dbGenerator();
    const results: { similarity: number; object: any }[] = [];
    // The filter does not change while streaming, so unpack it only once.
    const filterEntries = Object.entries(filter);

    for await (const record of generator) {
      if (filterEntries.every(([key, value]) => record[key] === value)) {
        const similarity = cosineSimilarity(queryEmbedding, record.embedding);
        results.push({ similarity, object: record });
      }
    }
    results.sort((a, b) => b.similarity - a.similarity);
    return results.slice(0, topK);
  }

  /**
   * Deletes an entire IndexedDB database.
   *
   * @param DBname - Name of the database to delete.
   * @throws Error if the `indexedDB` global is missing; the returned promise
   *   rejects if the delete request reports an error.
   */
  async deleteIndexedDB(DBname: string = 'clientVectorDB'): Promise<void> {
    if (typeof indexedDB === 'undefined') {
      console.error('IndexedDB is not defined');
      throw new Error('IndexedDB is not supported');
    }
    return new Promise<void>((resolve, reject) => {
      const request = indexedDB.deleteDatabase(DBname);

      request.onsuccess = () => {
        console.log(`Database '${DBname}' deleted`);
        resolve();
      };
      request.onerror = (event) => {
        console.error('Failed to delete database', event);
        reject(new Error('Failed to delete database'));
      };
    });
  }

  /**
   * Deletes one object store from an IndexedDB database.
   *
   * Note that `IndexedDbManager.create` creates the object store when it is
   * missing, so a store that did not exist is created and then deleted.
   *
   * @param DBname - Name of the database holding the object store.
   * @param objectStoreName - Name of the object store to delete.
   * @throws Error if the deletion fails (the underlying error is logged).
   */
  async deleteIndexedDBObjectStore(
    DBname: string = 'clientVectorDB',
    objectStoreName: string = 'ClientEmbeddingStore',
  ): Promise<void> {
    const db = await IndexedDbManager.create(DBname, objectStoreName);

    try {
      await db.deleteIndexedDBObjectStoreFromDB(DBname, objectStoreName);
      console.log(
        `Object store '${objectStoreName}' deleted from database '${DBname}'`,
      );
    } catch (error) {
      console.error('Error deleting object store:', error);
      throw new Error('Error deleting object store');
    }
  }

  /**
   * Reads every record of an IndexedDB object store into an array.
   *
   * @param DBname - Name of the database to read from.
   * @param objectStoreName - Name of the object store to read from.
   * @returns All records yielded by `IndexedDbManager.dbGenerator`.
   */
  async getAllObjectsFromIndexedDB(
    DBname: string = 'clientVectorDB',
    objectStoreName: string = 'ClientEmbeddingStore',
  ): Promise<any[]> {
    const db = await IndexedDbManager.create(DBname, objectStoreName);
    const objects: any[] = [];
    for await (const record of db.dbGenerator()) {
      objects.push(record);
    }
    return objects;
  }
}

--------------------------------------------------------------------------------
/src/indexedDB.ts:
--------------------------------------------------------------------------------
// uncomment for testing only
// import { IDBFactory } from 'fake-indexeddb';
// const indexedDB = new IDBFactory();

/**
 * Thin promise-based wrapper around one object store of one IndexedDB
 * database. Every operation opens its own connection to the database and
 * closes it again; no connection is kept on the instance.
 */
export class IndexedDbManager {
  private DBname!: string;
  private objectStoreName!: string;

  /**
   * Only records the names; use `IndexedDbManager.create` to also make sure
   * the object store exists.
   */
  constructor(DBname: string, objectStoreName: string) {
    this.DBname = DBname;
    this.objectStoreName = objectStoreName;
  }

  /**
   * Builds a manager and makes sure its object store exists, creating the
   * store (through a version upgrade) when it is missing.
   *
   * @param DBname - Name of the database.
   * @param objectStoreName - Name of the object store.
   * @param index - Optional key path to index when the store is created.
   * @returns The ready-to-use manager. The promise rejects if the database
   *   cannot be opened or the object store cannot be created.
   */
  static async create(
    DBname: string = 'clientVectorDB',
    objectStoreName: string = 'ClientEmbeddingStore',
    index: string | null = null,
  ): Promise<IndexedDbManager> {
    const instance = new IndexedDbManager(DBname, objectStoreName);
    return new Promise<IndexedDbManager>((resolve, reject) => {
      const request = indexedDB.open(DBname);

      request.onerror = (event) => {
        console.error('IndexedDB error:', event);
        reject(new Error('Database initialization failed'));
      };

      request.onsuccess = async () => {
        const db = request.result;
        const storeExists = db.objectStoreNames.contains(objectStoreName);
        // Close this probe connection before any upgrade is attempted so it
        // cannot stand in the way of the version change.
        db.close();
        try {
          if (!storeExists) {
            await instance.createObjectStore(index);
          }
          resolve(instance);
        } catch (error) {
          // Forward the failure; otherwise the outer promise never settles.
          reject(error);
        }
      };
    });
  }

  /**
   * Creates this manager's object store by reopening the database with its
   * version incremented by one. The store uses auto-incremented keys.
   * Nothing is created if the store already exists, but the version is
   * still bumped.
   *
   * @param index - Optional key path; when given, a non-unique index named
   *   `by_<index>` is created on it.
   */
  async createObjectStore(index: string | null = null): Promise<void> {
    return new Promise<void>((resolve, reject) => {
      const request = indexedDB.open(this.DBname);
      request.onsuccess = () => {
        const db1 = request.result;
        const version = db1.version;
        db1.close();
        // Object stores can only be created during an upgrade, hence the
        // reopen with a higher version.
        const request_2 = indexedDB.open(this.DBname, version + 1);
        request_2.onupgradeneeded = () => {
          const db2 = request_2.result;
          if (!db2.objectStoreNames.contains(this.objectStoreName)) {
            const objectStore = db2.createObjectStore(this.objectStoreName, {
              autoIncrement: true,
            });
            if (index) {
              objectStore.createIndex(`by_${index}`, index, { unique: false });
            }
          }
        };
        request_2.onsuccess = () => {
          const db2 = request_2.result;
          console.log('Object store creation successful');
          db2.close();
          resolve();
        };
        request_2.onerror = (event) => {
          console.error('Error creating object store:', event);
          reject(new Error('Error creating object store'));
        };
      };
      request.onerror = (event) => {
        console.error('Error opening database:', event);
        reject(new Error('Error opening database'));
      };
    });
  }

  /**
   * Adds one object or an array of objects to the object store inside a
   * single read-write transaction.
   *
   * @param objs - A single object or an array of objects to store.
   * @returns Resolves once the transaction has completed. Rejects if the
   *   database cannot be opened, the transaction cannot be started, an
   *   object cannot be added, or the transaction aborts.
   */
  async addToIndexedDB(
    objs: { [key: string]: any }[] | { [key: string]: any },
  ): Promise<void> {
    // Normalise into a local instead of reassigning the parameter.
    const items: { [key: string]: any }[] = Array.isArray(objs) ? objs : [objs];

    return new Promise<void>((resolve, reject) => {
      const request = indexedDB.open(this.DBname);

      request.onerror = (event) => {
        console.error('Failed to open database', event);
        reject(new Error('Failed to open database'));
      };

      request.onsuccess = () => {
        const db = request.result;
        let transaction: IDBTransaction | undefined;
        let settled = false;
        // Log and reject only once, however many error/abort events arrive.
        const fail = (reason: unknown) => {
          if (settled) {
            return;
          }
          settled = true;
          console.error('Failed to add object', reason);
          reject(new Error('Failed to add object'));
        };

        try {
          transaction = db.transaction([this.objectStoreName], 'readwrite');
          const objectStore = transaction.objectStore(this.objectStoreName);

          transaction.oncomplete = () => {
            if (!settled) {
              settled = true;
              resolve();
            }
          };
          // Errors of individual add requests bubble up to the transaction.
          transaction.onerror = (event) => fail(event);
          transaction.onabort = (event) => fail(event);

          for (const obj of items) {
            objectStore.add(obj);
          }
        } catch (error) {
          // db.transaction() or add() threw synchronously (e.g. missing
          // store, value that cannot be stored): report it and drop any
          // adds that were already queued.
          fail(error);
          transaction?.abort();
        } finally {
          // Requested right away, as before; the transaction set up above
          // is allowed to finish first.
          db.close();
        }
      };
    });
  }

  /**
   * Asynchronously yields the value of every record in the object store, in
   * cursor order.
   *
   * Records are buffered as the cursor delivers them, so a consumer that is
   * slower than the cursor does not lose records; the buffer can grow up to
   * the number of records not yet consumed. Failures (database cannot be
   * opened, store missing, cursor error) are logged and end the iteration
   * instead of being thrown to the consumer. The database connection is
   * closed when iteration ends for any reason, including an early `break`.
   */
  async *dbGenerator(): AsyncGenerator<any, void, undefined> {
    const objectStoreName = this.objectStoreName;
    let db: IDBDatabase | undefined;

    try {
      db = await new Promise<IDBDatabase>((resolve, reject) => {
        const request = indexedDB.open(this.DBname);
        request.onsuccess = () => {
          resolve(request.result);
        };
        request.onerror = () => {
          reject(new Error('Could not open DB'));
        };
      });

      const transaction = db.transaction([objectStoreName], 'readonly');
      const objectStore = transaction.objectStore(objectStoreName);
      const request = objectStore.openCursor();

      const pending: any[] = [];
      const state: { done: boolean; error: Error | null } = {
        done: false,
        error: null,
      };
      let wake: (() => void) | null = null;
      // Resumes the consumer loop if it is currently waiting for data.
      const notify = () => {
        const resume = wake;
        wake = null;
        resume?.();
      };

      request.onsuccess = () => {
        const cursor = request.result;
        if (cursor) {
          pending.push(cursor.value);
          // Advance inside the handler, while the request's transaction is
          // still usable; buffering decouples this from consumer speed.
          cursor.continue();
        } else {
          state.done = true;
        }
        notify();
      };
      request.onerror = () => {
        state.error = new Error('Could not read from object store');
        state.done = true;
        notify();
      };

      while (true) {
        if (pending.length > 0) {
          yield pending.shift();
        } else if (state.done) {
          break;
        } else {
          await new Promise<void>((resolve) => {
            wake = resolve;
          });
        }
      }

      if (state.error) {
        throw state.error;
      }
    } catch (error) {
      console.error('An error occurred:', error);
    } finally {
      db?.close();
    }
  }

  /**
   * Deletes an object store by reopening this manager's database with its
   * version incremented by one.
   *
   * @param DBname - Database name used in log and error messages only; the
   *   database actually opened is the one this manager was created for.
   * @param objectStoreName - Name of the object store to delete.
   * @returns Resolves after the store was deleted. Rejects if the database
   *   cannot be opened, the upgrade fails, or the store does not exist.
   */
  async deleteIndexedDBObjectStoreFromDB(
    DBname: string,
    objectStoreName: string,
  ): Promise<void> {
    return new Promise<void>((resolve, reject) => {
      const request = indexedDB.open(this.DBname);

      request.onsuccess = () => {
        const db = request.result;
        const version = db.version;
        db.close();
        // Object stores can only be deleted during an upgrade.
        const request_2 = indexedDB.open(db.name, version + 1);
        let storeFound = true;
        request_2.onupgradeneeded = () => {
          const db2 = request_2.result;
          if (db2.objectStoreNames.contains(objectStoreName)) {
            db2.deleteObjectStore(objectStoreName);
          } else {
            storeFound = false;
            console.error(
              `Object store '${objectStoreName}' not found in database '${DBname}'`,
            );
            reject(
              new Error(
                `Object store '${objectStoreName}' not found in database '${DBname}'`,
              ),
            );
          }
        };
        request_2.onsuccess = () => {
          const db2 = request_2.result;
          db2.close();
          // The upgrade still completes when the store was missing; do not
          // report that case as a successful deletion.
          if (storeFound) {
            console.log('Object store deletion successful');
            resolve();
          }
        };
        request_2.onerror = (event) => {
          // A failed open request provides no database connection, so there
          // is nothing to close here; just report the failure.
          console.error('Failed to delete object store', event);
          reject(new Error('Failed to delete object store'));
        };
      };
      request.onerror = (event) => {
        console.error('Failed to open database', event);
        reject(new Error('Failed to open database'));
      };
    });
  }
}

--------------------------------------------------------------------------------
/src/utils.ts:
--------------------------------------------------------------------------------
/**
 * Cosine similarity of two equally long vectors, rounded to `precision`
 * decimal places.
 *
 * @param vecA - First vector.
 * @param vecB - Second vector.
 * @param precision - Number of decimal places kept in the result.
 * @returns The rounded similarity, or 0 if either vector has zero magnitude.
 * @throws Error if the vectors differ in length.
 */
export const cosineSimilarity = (
  vecA: number[],
  vecB: number[],
  precision: number = 6,
): number => {
  // Check if both vectors have the same length
  if (vecA.length !== vecB.length) {
    throw new Error('Vectors must have the same length');
  }

  // Compute dot product and magnitudes
  const dotProduct = vecA.reduce((sum, a, i) => {
    // Indexed access is typed as possibly undefined; treat a hole as 0.
    const b = vecB[i];
    return sum + a * (b ?? 0);
  }, 0);
  const magnitudeA = Math.sqrt(vecA.reduce((sum, a) => sum + a * a, 0));
  const magnitudeB = Math.sqrt(vecB.reduce((sum, b) => sum + b * b, 0));

  // Check if either magnitude is zero
  if (magnitudeA === 0 || magnitudeB === 0) {
    return 0;
  }

  // Calculate cosine similarity and round to specified precision
  return parseFloat(
    (dotProduct / (magnitudeA * magnitudeB)).toFixed(precision),
  );
};

--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
{
  "compilerOptions": {
    "target": "ESNext",
    "lib": ["ESNext", "DOM"],
    "module": "esnext",
    "rootDir": "./src",
    "moduleResolution": "node",
    "baseUrl": "./",
    "resolveJsonModule": true,
    "allowJs": true,
    "declaration": true,
    "declarationMap": true,
    "sourceMap": true,
    "outDir": "./dist",
    "noUnusedParameters": true,
    "noUnusedLocals": true,
    // "target": "es2016", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
    // "module": "commonjs", /* Specify what module code is generated. */
    "esModuleInterop": true, /* Emit additional JavaScript to ease support for importing CommonJS modules. This enables 'allowSyntheticDefaultImports' for type compatibility. */
    "forceConsistentCasingInFileNames": true, /* Ensure that casing is correct in imports. */
    "strict": true, /* Enable all strict type-checking options. */
    "skipLibCheck": true, /* Skip type checking all .d.ts files. */
    "noUncheckedIndexedAccess": true,
    "noEmit": true
  }
}

--------------------------------------------------------------------------------