├── src ├── .vs │ ├── SharpVector │ │ ├── FileContentIndex │ │ │ ├── read.lock │ │ │ ├── 1f76313d-a8de-47ce-81a3-ac1fc7438030.vsidx │ │ │ ├── a904092f-4585-40f5-9f93-3556202611e1.vsidx │ │ │ ├── e534ca41-141b-4115-9099-c4b3a40cc99e.vsidx │ │ │ └── ece43a1c-28dc-443c-bc82-afc0d35267af.vsidx │ │ └── v17 │ │ │ ├── TestStore │ │ │ └── 0 │ │ │ │ ├── 000.testlog │ │ │ │ └── testlog.manifest │ │ │ ├── .suo │ │ │ └── .futdcache.v2 │ └── ProjectEvaluation │ │ ├── sharpvector.metadata.v6.1 │ │ └── sharpvector.projects.v6.1 ├── run.sh ├── build-release.sh ├── Build5Nines.SharpVector │ ├── VectorComparison.cs │ ├── Embeddings │ │ ├── IEmbeddingsGenerator.cs │ │ └── IBatchEmbeddingsGenerator.cs │ ├── Id │ │ ├── GuidIdGenerator.cs │ │ ├── IIdGenerator.cs │ │ ├── IntIdGenerator.cs │ │ ├── ISequentialIdGenerator.cs │ │ └── NumericIdGenerator.cs │ ├── Preprocessing │ │ ├── ITextPreprocessor.cs │ │ └── BasicTextPreprocessor.cs │ ├── BasicMemoryVectorDatabase.cs │ ├── Data │ │ ├── TextChunkingMethod.cs │ │ ├── TextChunkingOptions.cs │ │ └── TextDataLoader.cs │ ├── DatabaseInfo.cs │ ├── VectorTextDatabaseItem.cs │ ├── VectorStore │ │ ├── IVectorStoreWithVocabulary.cs │ │ ├── MemoryDictionaryVectorStoreWithVocabulary.cs │ │ ├── IVectorStore.cs │ │ └── MemoryDictionaryVectorStore.cs │ ├── docs │ │ ├── LICENSE │ │ └── README.md │ ├── Build5Nines.SharpVector.csproj │ ├── Vocabulary │ │ ├── IVocabularyStore.cs │ │ └── DictionaryVocabularyStore.cs │ ├── BasicDiskMemoryVectorDatabaseBase.cs │ ├── Vectorization │ │ ├── IVectorizer.cs │ │ └── BagOfWordsVectorizer.cs │ ├── VectorTextItem.cs │ ├── IVectorDatabaseExtensions.cs │ ├── BasicDiskVectorDatabase.cs │ ├── VectorCompare │ │ ├── IVectorComparer.cs │ │ ├── EuclideanDistanceVectorComparerAsync.cs │ │ └── CosineSimilarityVectorComparerAsync.cs │ ├── DatabaseFileException.cs │ ├── MemoryVectorDatabase.cs │ ├── MemoryVectorDatabaseBase.cs │ ├── VectorTextResult.cs │ └── VectorTextResultItem.cs ├── Build5Nines.SharpVector.Playground │ ├── 
wwwroot │ │ ├── favicon.png │ │ └── app.css │ ├── appsettings.Development.json │ ├── appsettings.json │ ├── Components │ │ ├── Routes.razor │ │ ├── _Imports.razor │ │ ├── Layout │ │ │ ├── MainLayout.razor │ │ │ ├── NavMenu.razor │ │ │ ├── MainLayout.razor.css │ │ │ └── NavMenu.razor.css │ │ ├── App.razor │ │ └── Pages │ │ │ └── Error.razor │ ├── Build5Nines.SharpVector.Playground.csproj │ ├── Program.cs │ └── Properties │ │ └── launchSettings.json ├── SharpVectorTest │ ├── Regression │ │ ├── regression-vector-database-v2.0.2.b59vdb │ │ └── RegressionTests.cs │ ├── SharpVectorTest.csproj │ ├── BatchAddTests.cs │ ├── DiskVectorDatabaseTests.cs │ ├── VectorStore │ │ └── MemoryDictionaryVectorStoreTest.cs │ └── Preprocessing │ │ └── BasicTextPreprocessorTests.cs ├── SharpVectorOpenAITest │ ├── UnitTest1.cs │ └── SharpVectorOpenAITest.csproj ├── SharpVectorPerformance │ ├── Program.cs │ ├── SharpVectorPerformance.csproj │ └── DiskVectorDatabasePerformance.cs ├── Build5Nines.SharpVector.Ollama │ ├── docs │ │ ├── README.md │ │ └── LICENSE │ ├── BasicOllamaMemoryVectorDatabase.cs │ ├── Build5Nines.SharpVector.Ollama.csproj │ ├── OllamaMemoryVectorDatabase.cs │ └── Embeddings │ │ └── OllamaEmbeddingsGenerator.cs ├── ConsoleTest │ └── ConsoleTest.csproj ├── OllamaConsoleTest │ ├── OllamaConsoleTest.csproj │ └── Program.cs ├── Build5Nines.SharpVector.OpenAI │ ├── BasicOpenAIMemoryVectorDatabase.cs │ ├── docs │ │ ├── README.md │ │ └── LICENSE │ ├── OpenAIMemoryVectorDatabase.cs │ ├── Build5Nines.SharpVector.OpenAI.csproj │ └── Embeddings │ │ └── OpenAIEmbeddingsGenerator.cs ├── OpenAIConsoleTest │ ├── OpenAIConsoleTest.csproj │ └── Program.cs ├── .vscode │ ├── tasks.json │ └── launch.json └── SharpVector.sln ├── docs ├── docs │ ├── images │ │ ├── logo.png │ │ ├── favicon.png │ │ └── samples │ │ │ └── build5nines-sharpvector-console-screenshot.jpg │ ├── embeddings │ │ ├── index.md │ │ ├── ollama │ │ │ └── index.md │ │ └── openai │ │ │ └── index.md │ ├── license │ │ └── index.md 
│ ├── samples │ │ └── index.md │ ├── resources │ │ └── index.md │ ├── get-started │ │ ├── data-management │ │ │ └── index.md │ │ ├── index.md │ │ ├── search │ │ │ └── index.md │ │ └── metadata │ │ │ └── index.md │ └── persistence │ │ └── index.md ├── update-theme.sh ├── requirements.txt └── overrides │ └── main.html ├── .gitignore ├── assets ├── build5nines-sharpvector-console-screenshot.jpg ├── github-opengraph-build5nines-sharpvector-dark.jpg └── github-opengraph-build5nines-sharpvector-light.jpg ├── samples ├── azure │ └── document-intelligence │ │ └── b59-azure-doc-intelligence │ │ ├── document.pdf │ │ ├── b59-azure-doc-intelligence.csproj │ │ └── Program.cs └── genai-rag-onnx │ └── genai-rag-onnx.csproj ├── LICENSE ├── .github └── workflows │ ├── build-release.yml │ ├── ghpages-mkdocs.yml │ └── mkdocs-build.yml ├── README.md └── CODE_OF_CONDUCT.md /src/.vs/SharpVector/FileContentIndex/read.lock: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/run.sh: -------------------------------------------------------------------------------- 1 | dotnet run --project "ConsoleTest" -------------------------------------------------------------------------------- /src/build-release.sh: -------------------------------------------------------------------------------- 1 | dotnet build --configuration Release -------------------------------------------------------------------------------- /src/.vs/SharpVector/v17/TestStore/0/000.testlog: -------------------------------------------------------------------------------- 1 | !!tItseT -------------------------------------------------------------------------------- /docs/docs/images/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Build5Nines/SharpVector/HEAD/docs/docs/images/logo.png 
-------------------------------------------------------------------------------- /docs/docs/images/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Build5Nines/SharpVector/HEAD/docs/docs/images/favicon.png -------------------------------------------------------------------------------- /src/.vs/SharpVector/v17/.suo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Build5Nines/SharpVector/HEAD/src/.vs/SharpVector/v17/.suo -------------------------------------------------------------------------------- /src/.vs/SharpVector/v17/.futdcache.v2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Build5Nines/SharpVector/HEAD/src/.vs/SharpVector/v17/.futdcache.v2 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | obj 3 | bin 4 | 5 | .DS_Store 6 | 7 | BenchmarkDotNet.Artifacts/ 8 | TestResults/ 9 | 10 | docs/site 11 | .cache 12 | -------------------------------------------------------------------------------- /src/.vs/ProjectEvaluation/sharpvector.metadata.v6.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Build5Nines/SharpVector/HEAD/src/.vs/ProjectEvaluation/sharpvector.metadata.v6.1 -------------------------------------------------------------------------------- /src/.vs/ProjectEvaluation/sharpvector.projects.v6.1: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Build5Nines/SharpVector/HEAD/src/.vs/ProjectEvaluation/sharpvector.projects.v6.1 -------------------------------------------------------------------------------- /src/.vs/SharpVector/v17/TestStore/0/testlog.manifest: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Build5Nines/SharpVector/HEAD/src/.vs/SharpVector/v17/TestStore/0/testlog.manifest -------------------------------------------------------------------------------- /assets/build5nines-sharpvector-console-screenshot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Build5Nines/SharpVector/HEAD/assets/build5nines-sharpvector-console-screenshot.jpg -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/VectorComparison.cs: -------------------------------------------------------------------------------- 1 | namespace Build5Nines.SharpVector; 2 | 3 | //public record VectorComparison(TId Id, float vectorComparison); 4 | -------------------------------------------------------------------------------- /assets/github-opengraph-build5nines-sharpvector-dark.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Build5Nines/SharpVector/HEAD/assets/github-opengraph-build5nines-sharpvector-dark.jpg -------------------------------------------------------------------------------- /assets/github-opengraph-build5nines-sharpvector-light.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Build5Nines/SharpVector/HEAD/assets/github-opengraph-build5nines-sharpvector-light.jpg -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector.Playground/wwwroot/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Build5Nines/SharpVector/HEAD/src/Build5Nines.SharpVector.Playground/wwwroot/favicon.png -------------------------------------------------------------------------------- 
/docs/docs/images/samples/build5nines-sharpvector-console-screenshot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Build5Nines/SharpVector/HEAD/docs/docs/images/samples/build5nines-sharpvector-console-screenshot.jpg -------------------------------------------------------------------------------- /src/SharpVectorTest/Regression/regression-vector-database-v2.0.2.b59vdb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Build5Nines/SharpVector/HEAD/src/SharpVectorTest/Regression/regression-vector-database-v2.0.2.b59vdb -------------------------------------------------------------------------------- /samples/azure/document-intelligence/b59-azure-doc-intelligence/document.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Build5Nines/SharpVector/HEAD/samples/azure/document-intelligence/b59-azure-doc-intelligence/document.pdf -------------------------------------------------------------------------------- /docs/update-theme.sh: -------------------------------------------------------------------------------- 1 | rm -rf ./themes/material 2 | 3 | git clone https://github.com/yakworks/docmark.git ./themes/.github 4 | 5 | cp -r ./themes/.github/material ./themes/material 6 | 7 | rm -rf ./themes/.github 8 | -------------------------------------------------------------------------------- /src/.vs/SharpVector/FileContentIndex/1f76313d-a8de-47ce-81a3-ac1fc7438030.vsidx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Build5Nines/SharpVector/HEAD/src/.vs/SharpVector/FileContentIndex/1f76313d-a8de-47ce-81a3-ac1fc7438030.vsidx -------------------------------------------------------------------------------- /src/.vs/SharpVector/FileContentIndex/a904092f-4585-40f5-9f93-3556202611e1.vsidx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Build5Nines/SharpVector/HEAD/src/.vs/SharpVector/FileContentIndex/a904092f-4585-40f5-9f93-3556202611e1.vsidx -------------------------------------------------------------------------------- /src/.vs/SharpVector/FileContentIndex/e534ca41-141b-4115-9099-c4b3a40cc99e.vsidx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Build5Nines/SharpVector/HEAD/src/.vs/SharpVector/FileContentIndex/e534ca41-141b-4115-9099-c4b3a40cc99e.vsidx -------------------------------------------------------------------------------- /src/.vs/SharpVector/FileContentIndex/ece43a1c-28dc-443c-bc82-afc0d35267af.vsidx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Build5Nines/SharpVector/HEAD/src/.vs/SharpVector/FileContentIndex/ece43a1c-28dc-443c-bc82-afc0d35267af.vsidx -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | mkdocs 2 | mkdocs-material 3 | mkdocs-material[imaging] 4 | pymdown-extensions 5 | markdown-include 6 | mkdocs-git-committers-plugin-2 7 | mkdocs-git-revision-date-localized-plugin 8 | mkdocs-with-pdf 9 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector.Playground/appsettings.Development.json: -------------------------------------------------------------------------------- 1 | { 2 | "Logging": { 3 | "LogLevel": { 4 | "Default": "Information", 5 | "Microsoft.AspNetCore": "Warning" 6 | } 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/Embeddings/IEmbeddingsGenerator.cs: -------------------------------------------------------------------------------- 1 | namespace 
Build5Nines.SharpVector.Embeddings; 2 | 3 | public interface IEmbeddingsGenerator 4 | { 5 | Task GenerateEmbeddingsAsync(string text); 6 | } -------------------------------------------------------------------------------- /src/SharpVectorOpenAITest/UnitTest1.cs: -------------------------------------------------------------------------------- 1 | namespace SharpVectorOpenAITest; 2 | 3 | [TestClass] 4 | public class UnitTest1 5 | { 6 | [TestMethod] 7 | public void TestMethod1() 8 | { 9 | } 10 | } -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/Id/GuidIdGenerator.cs: -------------------------------------------------------------------------------- 1 | namespace Build5Nines.SharpVector.Id; 2 | 3 | public class GuidIdGenerator : IIdGenerator 4 | { 5 | public Guid NewId() 6 | { 7 | return Guid.NewGuid(); 8 | } 9 | } -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector.Playground/appsettings.json: -------------------------------------------------------------------------------- 1 | { 2 | "Logging": { 3 | "LogLevel": { 4 | "Default": "Information", 5 | "Microsoft.AspNetCore": "Warning" 6 | } 7 | }, 8 | "AllowedHosts": "*" 9 | } 10 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/Id/IIdGenerator.cs: -------------------------------------------------------------------------------- 1 | namespace Build5Nines.SharpVector.Id; 2 | 3 | public interface IIdGenerator 4 | where TId : notnull 5 | { 6 | /// 7 | /// Generates a new ID. 
8 | /// 9 | /// 10 | TId NewId(); 11 | } -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/Id/IntIdGenerator.cs: -------------------------------------------------------------------------------- 1 | namespace Build5Nines.SharpVector.Id; 2 | 3 | public class IntIdGenerator : NumericIdGenerator 4 | { 5 | public IntIdGenerator() : base() 6 | { } 7 | 8 | public IntIdGenerator(int mostRecentId) : base(mostRecentId) 9 | { } 10 | } 11 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/Preprocessing/ITextPreprocessor.cs: -------------------------------------------------------------------------------- 1 | namespace Build5Nines.SharpVector.Preprocessing; 2 | 3 | public interface ITextPreprocessor 4 | { 5 | IEnumerable TokenizeAndPreprocess(TToken text); 6 | Task> TokenizeAndPreprocessAsync(TToken text); 7 | } 8 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector.Playground/Components/Routes.razor: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/BasicMemoryVectorDatabase.cs: -------------------------------------------------------------------------------- 1 | namespace Build5Nines.SharpVector; 2 | 3 | /// 4 | /// A basic implementation of a vector database that uses an in-memory dictionary to store vectors, with integer keys and string metadata values. 
5 | /// 6 | public class BasicMemoryVectorDatabase : MemoryVectorDatabase 7 | { } -------------------------------------------------------------------------------- /docs/overrides/main.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block scripts %} 4 | 5 | {{ super() }} 6 | 7 | 8 | 9 | 10 | 11 | {% endblock %} -------------------------------------------------------------------------------- /src/SharpVectorPerformance/Program.cs: -------------------------------------------------------------------------------- 1 | // See https://aka.ms/new-console-template for more information 2 | using BenchmarkDotNet.Running; 3 | 4 | namespace SharpVectorPerformance; 5 | 6 | public class Program 7 | { 8 | public static void Main(string[] args) 9 | { 10 | BenchmarkRunner.Run(); 11 | BenchmarkRunner.Run(); 12 | } 13 | } -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector.Ollama/docs/README.md: -------------------------------------------------------------------------------- 1 | Build5Nines.SharpVector.Ollama is the lightweight in-memory Vector Database for use in any .NET application that connects to an embeddings model running in Ollama for generating the text embeddings. 2 | 3 | The `Build5Nines.SharpVector.Ollama.BasicOllamaMemoryVectorDatabase` class uses an Ollama embeddings model with Cosine similarity search. 
4 | 5 | [Read Documentation](https://sharpvector.build5nines.com) 6 | -------------------------------------------------------------------------------- /src/ConsoleTest/ConsoleTest.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | Exe 9 | net8.0 10 | enable 11 | enable 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /src/OllamaConsoleTest/OllamaConsoleTest.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | Exe 10 | net8.0 11 | enable 12 | enable 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/Data/TextChunkingMethod.cs: -------------------------------------------------------------------------------- 1 | namespace Build5Nines.SharpVector.Data; 2 | 3 | public enum TextChunkingMethod 4 | { 5 | /// 6 | /// Split the text into paragraphs 7 | /// 8 | Paragraph, 9 | /// 10 | /// Split the text into sentences 11 | /// 12 | Sentence, 13 | /// 14 | /// Split the text into fixed length chunks 15 | /// 16 | FixedLength, 17 | /// 18 | /// Split the text into overlapping windows 19 | /// 20 | OverlappingWindow 21 | } -------------------------------------------------------------------------------- /src/SharpVectorPerformance/SharpVectorPerformance.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Exe 5 | net8.0 6 | enable 7 | enable 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector.Playground/Build5Nines.SharpVector.Playground.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | net8.0 5 | enable 6 | enable 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 
-------------------------------------------------------------------------------- /src/Build5Nines.SharpVector.OpenAI/BasicOpenAIMemoryVectorDatabase.cs: -------------------------------------------------------------------------------- 1 | using OpenAI.Embeddings; 2 | 3 | namespace Build5Nines.SharpVector.OpenAI 4 | { 5 | 6 | /// 7 | /// A basic implementation of a vector database that uses an in-memory dictionary to store vectors generated using the specified OpenAI embeddings client, with integer keys and string metadata values. 8 | /// 9 | public class BasicOpenAIMemoryVectorDatabase : OpenAIMemoryVectorDatabase 10 | { 11 | public BasicOpenAIMemoryVectorDatabase(EmbeddingClient embeddingClient) 12 | : base(embeddingClient) 13 | { } 14 | } 15 | 16 | } -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/Id/ISequentialIdGenerator.cs: -------------------------------------------------------------------------------- 1 | namespace Build5Nines.SharpVector.Id; 2 | 3 | /// 4 | /// Interface for ID generators that support setting the most recently generated ID (sequential/numeric style). 5 | /// 6 | /// The ID type. 7 | public interface ISequentialIdGenerator : IIdGenerator 8 | where TId : notnull 9 | { 10 | /// 11 | /// Sets the most recent ID value so the next generated ID will continue the sequence. 12 | /// 13 | /// The most recently used/generated ID. 
14 | void SetMostRecent(TId mostRecentId); 15 | } 16 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector.Playground/Components/_Imports.razor: -------------------------------------------------------------------------------- 1 | @using System.Net.Http 2 | @using System.Net.Http.Json 3 | @using Microsoft.AspNetCore.Components.Forms 4 | @using Microsoft.AspNetCore.Components.Routing 5 | @using Microsoft.AspNetCore.Components.Web 6 | @using static Microsoft.AspNetCore.Components.Web.RenderMode 7 | @using Microsoft.AspNetCore.Components.Web.Virtualization 8 | @using Microsoft.JSInterop 9 | @using Build5Nines.SharpVector 10 | @using Build5Nines.SharpVector.Data; 11 | @using Build5Nines.SharpVector.Playground 12 | @using Build5Nines.SharpVector.Playground.Components 13 | @using BlazorMonaco 14 | @using BlazorMonaco.Editor 15 | @using BlazorMonaco.Languages -------------------------------------------------------------------------------- /samples/azure/document-intelligence/b59-azure-doc-intelligence/b59-azure-doc-intelligence.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Exe 5 | net9.0 6 | b59_azure_doc_intelligence 7 | enable 8 | enable 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector.OpenAI/docs/README.md: -------------------------------------------------------------------------------- 1 | Build5Nines.SharpVector.OpenAI is the lightweight in-memory Vector Database for use in any .NET application that connects to an embeddings model running in Azure OpenAI for generating the text embeddings. 2 | 3 | The `Build5Nines.SharpVector.OpenAI.BasicOpenAIMemoryVectorDatabase` class uses an OpenAI Embeddings Client with Cosine similarity search. 
4 | 5 | [Read Documentation](https://sharpvector.build5nines.com) 6 | 7 | ## Tutorials 8 | 9 | - [Enhanced In-Memory Text Vector Search in .NET with SharpVector and OpenAI Embeddings](https://build5nines.com/enhanced-in-memory-text-vector-search-in-net-with-sharpvector-and-openai-embeddings/?utm_source=github&utm_medium=sharpvector) by Chris Pietschmann -------------------------------------------------------------------------------- /samples/genai-rag-onnx/genai-rag-onnx.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Exe 5 | net8.0 6 | genai_rag_onnx 7 | enable 8 | enable 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector.Playground/Components/Layout/MainLayout.razor: -------------------------------------------------------------------------------- 1 | @inherits LayoutComponentBase 2 | 3 |
4 | @* *@ 7 | 8 |
9 |
10 |

Build5Nines.SharpVector Playground

11 | @* View Source *@ 12 |
13 | 14 |
15 | @Body 16 |
17 |
18 |
19 | 20 |
21 | An unhandled error has occurred. 22 | Reload 23 | 🗙 24 |
25 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector.Playground/Components/Layout/NavMenu.razor: -------------------------------------------------------------------------------- 1 | 6 | 7 | 8 | 9 | 18 | 19 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/DatabaseInfo.cs: -------------------------------------------------------------------------------- 1 | namespace Build5Nines.SharpVector; 2 | 3 | public class DatabaseInfo 4 | { 5 | internal const string SupportedVersion = "1.0.0"; 6 | internal const string SupportedSchema = "Build5Nines.SharpVector"; 7 | 8 | public DatabaseInfo() 9 | : this(null, null, null) 10 | { } 11 | public DatabaseInfo(string? classType) 12 | : this(SupportedSchema, SupportedVersion, classType) 13 | { } 14 | 15 | public DatabaseInfo(string? schema, string? version, string? classType) 16 | { 17 | Schema = schema; 18 | Version = version; 19 | ClassType = classType; 20 | } 21 | 22 | public string? Schema { get; set; } 23 | public string? Version { get; set; } 24 | public string? 
ClassType { get; set; } 25 | } -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/Id/NumericIdGenerator.cs: -------------------------------------------------------------------------------- 1 | namespace Build5Nines.SharpVector.Id; 2 | 3 | public class NumericIdGenerator : ISequentialIdGenerator 4 | where TId : struct 5 | { 6 | public NumericIdGenerator() 7 | { } 8 | 9 | public NumericIdGenerator(TId mostRecentId) : this() 10 | { 11 | this._lastId = mostRecentId; 12 | } 13 | 14 | private readonly object _lock = new object(); 15 | private TId _lastId = default(TId); 16 | 17 | public TId NewId() { 18 | lock(_lock) { 19 | dynamic current = _lastId; 20 | current++; 21 | _lastId = current; 22 | return _lastId; 23 | } 24 | } 25 | 26 | public void SetMostRecent(TId mostRecentId) 27 | { 28 | lock(_lock) { 29 | _lastId = mostRecentId; 30 | } 31 | } 32 | } -------------------------------------------------------------------------------- /src/OpenAIConsoleTest/OpenAIConsoleTest.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Exe 5 | net8.0 6 | enable 7 | enable 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/Embeddings/IBatchEmbeddingsGenerator.cs: -------------------------------------------------------------------------------- 1 | namespace Build5Nines.SharpVector.Embeddings; 2 | 3 | /// 4 | /// Optional capability for embeddings generators to support batch embedding of multiple texts. 5 | /// Implementations can leverage provider APIs that accept multi-input requests for better performance. 6 | /// 7 | public interface IBatchEmbeddingsGenerator : IEmbeddingsGenerator 8 | { 9 | /// 10 | /// Generates embeddings for multiple input texts in a single call when supported. 11 | /// 12 | /// Collection of texts to embed. 
Order should be preserved in output. 13 | /// A read-only list of embeddings vectors corresponding to the input order. 14 | Task> GenerateEmbeddingsAsync(IEnumerable texts); 15 | } 16 | -------------------------------------------------------------------------------- /src/.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | // See https://go.microsoft.com/fwlink/?LinkId=733558 3 | // for the documentation about the tasks.json format 4 | "version": "2.0.0", 5 | "tasks": [ 6 | { 7 | "label": "build", 8 | "command": "dotnet", 9 | "type": "shell", 10 | "args": [ 11 | "build", 12 | // Ask dotnet build to generate full paths for file names. 13 | "/property:GenerateFullPaths=true", 14 | // Do not generate summary otherwise it leads to duplicate errors in Problems panel 15 | "/consoleloggerparameters:NoSummary" 16 | ], 17 | "group": "build", 18 | "presentation": { 19 | "reveal": "silent" 20 | }, 21 | "problemMatcher": "$msCompile" 22 | } 23 | ] 24 | } -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector.Playground/Program.cs: -------------------------------------------------------------------------------- 1 | using Build5Nines.SharpVector.Playground.Components; 2 | 3 | var builder = WebApplication.CreateBuilder(args); 4 | 5 | // Add services to the container. 6 | builder.Services.AddRazorComponents() 7 | .AddInteractiveServerComponents(); 8 | 9 | var app = builder.Build(); 10 | 11 | // Configure the HTTP request pipeline. 12 | if (!app.Environment.IsDevelopment()) 13 | { 14 | app.UseExceptionHandler("/Error", createScopeForErrors: true); 15 | // The default HSTS value is 30 days. You may want to change this for production scenarios, see https://aka.ms/aspnetcore-hsts. 
16 | app.UseHsts(); 17 | } 18 | 19 | app.UseHttpsRedirection(); 20 | 21 | app.UseStaticFiles(); 22 | app.UseAntiforgery(); 23 | 24 | app.MapRazorComponents() 25 | .AddInteractiveServerRenderMode(); 26 | 27 | app.Run(); 28 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/VectorTextDatabaseItem.cs: -------------------------------------------------------------------------------- 1 | namespace Build5Nines.SharpVector; 2 | 3 | public interface IVectorTextDatabaseItem 4 | { 5 | TId Id { get; } 6 | TDocument Text { get; } 7 | TMetadata? Metadata { get; } 8 | float[] Vector { get; } 9 | } 10 | 11 | public class VectorTextDatabaseItem 12 | : IVectorTextDatabaseItem 13 | { 14 | public VectorTextDatabaseItem(TId id, TDocument text, TMetadata? metadata, float[] vector) 15 | { 16 | Id = id; 17 | Text = text; 18 | Metadata = metadata; 19 | Vector = vector; 20 | } 21 | 22 | public TId Id { get; private set; } 23 | public TDocument Text { get; private set; } 24 | public TMetadata? Metadata { get; private set; } 25 | public float[] Vector { get; private set; } 26 | } 27 | -------------------------------------------------------------------------------- /docs/docs/embeddings/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Embeddings 3 | --- 4 | # :fontawesome-solid-square-binary: Embeddings 5 | 6 | `Build5Nines.SharpVector` includes the following support for using AI Models to generate the text embeddings for the vector database instead of generating them locally. The use of an AI Embeddings model can greatly increase the quality of the semantic search. 7 | 8 |
9 | 10 | - :simple-openai:{ .lg .middle } __OpenAI Embeddings__ 11 | 12 | --- 13 | 14 | Use OpenAI and/or Azure OpenAI Service embeddings models like `text-embedding-ada-002` or others. 15 | 16 | [:octicons-arrow-right-24: Get Started](openai/index.md) 17 | 18 | - :simple-ollama:{ .lg .middle } __Ollama Embeddings__ 19 | 20 | --- 21 | 22 | Use Ollama embeddings models like `nomic-embed-text` or others. 23 | 24 | [:octicons-arrow-right-24: Get Started](ollama/index.md) 25 | 26 |
-------------------------------------------------------------------------------- /src/OllamaConsoleTest/Program.cs: -------------------------------------------------------------------------------- 1 | using Build5Nines.SharpVector; 2 | using Build5Nines.SharpVector.Ollama; 3 | using Build5Nines.SharpVector.Ollama.Embeddings; 4 | 5 | 6 | Console.WriteLine("Test OllamaEmbeddingsGenerator"); 7 | 8 | var generator = new OllamaEmbeddingsGenerator("nomic-embed-text"); 9 | var embeddings = await generator.GenerateEmbeddingsAsync("Hello World"); 10 | 11 | foreach (var item in embeddings) 12 | { 13 | Console.Write(item + ", "); 14 | } 15 | Console.WriteLine(""); 16 | 17 | Console.WriteLine("Test BasicOllamaMemoryVectorDatabase"); 18 | 19 | var vdb = new BasicOllamaMemoryVectorDatabase("nomic-embed-text"); //"http://localhost:11434/api/embeddings", "nomic-embed-text"); 20 | 21 | vdb.AddText("Hello World", "metadata"); 22 | vdb.AddText("Hola", "metadata2"); 23 | 24 | var result = vdb.Search("Hola Senior"); 25 | 26 | foreach (var item in result.Texts) 27 | { 28 | Console.WriteLine($"{item.Text} - {item.Metadata} - {item.VectorComparison}"); 29 | } 30 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector.Playground/Components/App.razor: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/VectorStore/IVectorStoreWithVocabulary.cs: -------------------------------------------------------------------------------- 1 | 2 | using Build5Nines.SharpVector.Vocabulary; 3 | 4 | namespace Build5Nines.SharpVector.VectorStore; 5 | 6 | /// 7 | /// Interface for a vector store with a vocabulary. 
8 | /// 9 | /// 10 | /// 11 | /// 12 | /// 13 | /// 14 | public interface IVectorStoreWithVocabulary 15 | : IVectorStore 16 | where TId : notnull 17 | where TVocabularyKey : notnull 18 | where TVocabularyStore : IVocabularyStore 19 | { 20 | /// 21 | /// The Vocabulary Store used to store the vocabulary of the database 22 | /// 23 | TVocabularyStore VocabularyStore { get; } 24 | } -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector.Ollama/BasicOllamaMemoryVectorDatabase.cs: -------------------------------------------------------------------------------- 1 | using Build5Nines.SharpVector.Embeddings; 2 | 3 | namespace Build5Nines.SharpVector.Ollama; 4 | 5 | /// 6 | /// A basic implementation of a vector database that uses an in-memory dictionary to store vectors generated using the specified Ollama embeddings generator, with integer keys and string metadata values. 7 | /// 8 | public class BasicOllamaMemoryVectorDatabase : OllamaMemoryVectorDatabase 9 | { 10 | public BasicOllamaMemoryVectorDatabase(string model) 11 | : this( 12 | new Embeddings.OllamaEmbeddingsGenerator(model) 13 | ) 14 | { } 15 | 16 | public BasicOllamaMemoryVectorDatabase(string ollamaEndpoint, string model) 17 | : this( 18 | new Embeddings.OllamaEmbeddingsGenerator(ollamaEndpoint, model) 19 | ) 20 | { } 21 | 22 | public BasicOllamaMemoryVectorDatabase(IEmbeddingsGenerator embeddingsGenerator) 23 | : base(embeddingsGenerator) 24 | { } 25 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Build5Nines LLC 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/docs/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Build5Nines LLC 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector.Ollama/docs/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Build5Nines LLC 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector.OpenAI/docs/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Build5Nines LLC 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/VectorStore/MemoryDictionaryVectorStoreWithVocabulary.cs: -------------------------------------------------------------------------------- 1 | 2 | using Build5Nines.SharpVector.VectorStore; 3 | using Build5Nines.SharpVector.Vocabulary; 4 | 5 | /// 6 | /// A thread safe simple in-memory database for storing and querying vectorized text items with a vocabulary. 
7 | /// 8 | /// 9 | /// 10 | /// 11 | /// 12 | /// 13 | public class MemoryDictionaryVectorStoreWithVocabulary 14 | : MemoryDictionaryVectorStore, IVectorStoreWithVocabulary 15 | where TId : notnull 16 | where TVocabularyKey : notnull 17 | where TVocabularyStore : IVocabularyStore 18 | { 19 | public TVocabularyStore VocabularyStore { get; } 20 | 21 | public MemoryDictionaryVectorStoreWithVocabulary(TVocabularyStore vocabularyStore) 22 | { 23 | VocabularyStore = vocabularyStore; 24 | } 25 | } -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/Build5Nines.SharpVector.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | net8.0 5 | enable 6 | enable 7 | true 8 | 9 | Build5Nines.SharpVector 10 | https://sharpvector.build5nines.com 11 | https://github.com/Build5Nines/SharpVector 12 | 2.2.0 13 | Lightweight In-memory Vector Database to embed in any .NET Applications 14 | Copyright (c) 2025 Build5Nines LLC 15 | README.md 16 | LICENSE 17 | Chris Pietschmann 18 | Build5Nines LLC 19 | vector;search;database;data;rag;search;llm;generative ai;ai;genai 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector.Playground/Properties/launchSettings.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json.schemastore.org/launchsettings.json", 3 | "iisSettings": { 4 | "windowsAuthentication": false, 5 | "anonymousAuthentication": true, 6 | "iisExpress": { 7 | "applicationUrl": "http://localhost:55365", 8 | "sslPort": 44358 9 | } 10 | }, 11 | "profiles": { 12 | "http": { 13 | "commandName": "Project", 14 | "dotnetRunMessages": true, 15 | "launchBrowser": true, 16 | "applicationUrl": "http://localhost:5188", 17 | "environmentVariables": { 18 | "ASPNETCORE_ENVIRONMENT": "Development" 19 | } 20 | }, 21 | "https": { 22 | 
"commandName": "Project", 23 | "dotnetRunMessages": true, 24 | "launchBrowser": true, 25 | "applicationUrl": "https://localhost:7156;http://localhost:5188", 26 | "environmentVariables": { 27 | "ASPNETCORE_ENVIRONMENT": "Development" 28 | } 29 | }, 30 | "IIS Express": { 31 | "commandName": "IISExpress", 32 | "launchBrowser": true, 33 | "environmentVariables": { 34 | "ASPNETCORE_ENVIRONMENT": "Development" 35 | } 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/SharpVectorOpenAITest/SharpVectorOpenAITest.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | net8.0 5 | enable 6 | enable 7 | 8 | false 9 | true 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/Data/TextChunkingOptions.cs: -------------------------------------------------------------------------------- 1 | namespace Build5Nines.SharpVector.Data; 2 | 3 | public class TextChunkingOptions 4 | { 5 | public TextChunkingOptions() 6 | { 7 | Method = TextChunkingMethod.Paragraph; 8 | ChunkSize = 100; 9 | #pragma warning disable CS8603 // Possible null reference return. 10 | RetrieveMetadata = (chunk) => default; 11 | #pragma warning restore CS8603 // Possible null reference return. 12 | OverlapSize = 50; 13 | } 14 | 15 | /// 16 | /// The method to use for chunking the text. Default is Paragraph. 17 | /// 18 | public TextChunkingMethod Method { get; set; } 19 | 20 | /// 21 | /// The length in tokens (aka "words") of each chunk of text. Default is 100. 22 | /// Only used by TextChunkingMethod.FixedLength and TextChunkingMethod.OverlappingWindow. 
23 | /// 24 | public int ChunkSize { get; set; } 25 | 26 | /// 27 | /// Lambda function to retrieve custom metadata for each chunk 28 | /// 29 | public Func RetrieveMetadata { get; set; } 30 | 31 | /// 32 | /// The number of words to overlap text chunks when using TextChunkingMethod.OverlappingWindow. Default is 50. 33 | /// 34 | public int OverlapSize { get; set; } 35 | } 36 | -------------------------------------------------------------------------------- /docs/docs/license/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: License 3 | description: Review the MIT license terms for using and contributing to the SharpVector open-source project. 4 | date: 2025-04-13 5 | --- 6 | 7 | # :octicons-file-badge-24: License 8 | 9 | ```text 10 | MIT License 11 | 12 | Copyright (c) 2025 Build5Nines LLC 13 | 14 | Permission is hereby granted, free of charge, to any person obtaining a copy 15 | of this software and associated documentation files (the "Software"), to deal 16 | in the Software without restriction, including without limitation the rights 17 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 18 | copies of the Software, and to permit persons to whom the Software is 19 | furnished to do so, subject to the following conditions: 20 | 21 | The above copyright notice and this permission notice shall be included in all 22 | copies or substantial portions of the Software. 23 | 24 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 25 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 26 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 27 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 28 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 29 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 30 | SOFTWARE.
31 | ``` 32 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector.Playground/Components/Pages/Error.razor: -------------------------------------------------------------------------------- 1 | @page "/Error" 2 | @using System.Diagnostics 3 | 4 | Error 5 | 6 |

Error.

7 |

An error occurred while processing your request.

8 | 9 | @if (ShowRequestId) 10 | { 11 |

12 | Request ID: @RequestId 13 |

14 | } 15 | 16 |

Development Mode

17 |

18 | Swapping to Development environment will display more detailed information about the error that occurred. 19 |

20 |

21 | The Development environment shouldn't be enabled for deployed applications. 22 | It can result in displaying sensitive information from exceptions to end users. 23 | For local debugging, enable the Development environment by setting the ASPNETCORE_ENVIRONMENT environment variable to Development 24 | and restarting the app. 25 |

26 | 27 | @code{ 28 | [CascadingParameter] 29 | private HttpContext? HttpContext { get; set; } 30 | 31 | private string? RequestId { get; set; } 32 | private bool ShowRequestId => !string.IsNullOrEmpty(RequestId); 33 | 34 | protected override void OnInitialized() => 35 | RequestId = Activity.Current?.Id ?? HttpContext?.TraceIdentifier; 36 | } 37 | -------------------------------------------------------------------------------- /src/SharpVectorTest/SharpVectorTest.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | net8.0 5 | enable 6 | enable 7 | 8 | false 9 | true 10 | 11 | 12 | 13 | 14 | runtime; build; native; contentfiles; analyzers; buildtransitive 15 | all 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | Always 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /docs/docs/samples/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Samples 3 | description: Explore real-world code samples to see SharpVector in action. Build search engines, intelligent note apps, and more. 4 | --- 5 | 6 | # :material-run-fast: Samples 7 | 8 | ## Sample Console App 9 | 10 | The sample console app in this repo shows example usage of `Build5Nines.SharpVector`. 11 | 12 | It loads a list of movie titles and descriptions from a JSON file, then allows the user to type in prompts to search the database and return the best matches.
13 | 14 | [View Source](https://github.com/Build5Nines/SharpVector/blob/main/src/ConsoleTest/Program.cs) 15 | 16 | Here's a screenshot of the test console app running: 17 | 18 | ![Screenshot of sample console app in the terminal](../images/samples/build5nines-sharpvector-console-screenshot.jpg) 19 | 20 | ## Generative AI + RAG + ONNX Model 21 | 22 | This example takes an [ONNX](https://onnxruntime.ai/docs/get-started/with-csharp.html) Generative AI app and extends it to implement Retrieval Augmented Generation (RAG) using `Build5Nines.SharpVector`. 23 | 24 | [View Source](https://github.com/Build5Nines/SharpVector/blob/main/samples/genai-rag-onnx/Program.cs) 25 | 26 | Read the "[Build a Generative AI + RAG App in C# with Phi-3, ONNX, and SharpVector 27 | ](https://build5nines.com/build-a-generative-ai-rag-app-in-c-with-phi-3-onnx-and-sharpvector/)" article for a detailed explanation of building this sample app. 28 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector.OpenAI/OpenAIMemoryVectorDatabase.cs: -------------------------------------------------------------------------------- 1 | using Build5Nines.SharpVector.Id; 2 | using Build5Nines.SharpVector.VectorCompare; 3 | using Build5Nines.SharpVector.VectorStore; 4 | using OpenAI.Embeddings; 5 | 6 | namespace Build5Nines.SharpVector.OpenAI; 7 | 8 | /// 9 | /// An interface for a vector database that uses OpenAI for embedding generation. 10 | /// 11 | /// 12 | /// 13 | public interface IOpenAIMemoryVectorDatabase : IVectorDatabase 14 | where TId : notnull 15 | { } 16 | 17 | /// 18 | /// A simple in-memory database for storing and querying vectorized text items. 19 | /// This database uses OpenAI to generate embeddings, and performs Cosine similarity search. 20 | /// 21 | /// Defines the data type for the Metadata stored with the Text.
22 | public class OpenAIMemoryVectorDatabase 23 | : OpenAIMemoryVectorDatabaseBase< 24 | int, 25 | TMetadata, 26 | MemoryDictionaryVectorStore, 27 | IntIdGenerator, 28 | CosineSimilarityVectorComparer 29 | > 30 | { 31 | public OpenAIMemoryVectorDatabase(EmbeddingClient embeddingClient) 32 | : base( 33 | embeddingClient, 34 | new MemoryDictionaryVectorStore() 35 | ) 36 | { } 37 | } 38 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/Vocabulary/IVocabularyStore.cs: -------------------------------------------------------------------------------- 1 | namespace Build5Nines.SharpVector.Vocabulary; 2 | 3 | 4 | public interface IVocabularyStore 5 | where TKey : notnull 6 | { 7 | /// 8 | /// Updates the vocabulary store 9 | /// 10 | /// 11 | /// 12 | void Update(IEnumerable tokens); 13 | 14 | /// 15 | /// Updates the vocabulary store asynchronously 16 | /// 17 | /// 18 | /// 19 | Task UpdateAsync(IEnumerable tokens); 20 | 21 | /// 22 | /// The number of items in the vocabulary store 23 | /// 24 | TValue Count { get; } 25 | 26 | /// 27 | /// Retrieves the index of a token 28 | /// 29 | /// 30 | /// 31 | /// 32 | bool TryGetValue(TKey token, out int index); 33 | 34 | /// 35 | /// Serializes the Vocabulary Store to a JSON stream 36 | /// 37 | /// 38 | /// 39 | Task SerializeToJsonStreamAsync(Stream stream); 40 | 41 | /// 42 | /// Deserializes the Vocabulary Store from a JSON stream 43 | /// 44 | /// 45 | /// 46 | Task DeserializeFromJsonStreamAsync(Stream stream); 47 | } -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector.Ollama/Build5Nines.SharpVector.Ollama.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | net8.0 5 | enable 6 | enable 7 | true 8 | 9 | Build5Nines.SharpVector.Ollama 10 | https://sharpvector.build5nines.com 11 | https://github.com/Build5Nines/SharpVector 12 | 2.0.3 13 | Lightweight 
In-memory Vector Database to embed in any .NET Applications that integrates with Ollama Embedding models for vector generation. 14 | Copyright (c) 2025 Build5Nines LLC 15 | README.md 16 | LICENSE 17 | Chris Pietschmann 18 | Build5Nines LLC 19 | vector;search;database;data;rag;ollama;embeddings;azure;microsoft; 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /docs/docs/resources/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Resources 3 | description: Dive deeper with curated resources, links, and tools for working with vector databases, semantic search, and SharpVector. 4 | --- 5 | 6 | # :octicons-link-24: Resources 7 | 8 | ## Tutorials 9 | 10 | Here are a few helpful tutorial links with additional documentation and examples on using `Build5Nines.SharpVector` in your own projects: 11 | 12 | - [Perform Vector Database Similarity Search in .NET Apps using Build5Nines.SharpVector](https://build5nines.com/using-build5nines-sharpvector-for-vector-similarity-search-in-net-applications/?utm_source=github&utm_medium=sharpvector) by Chris Pietschmann 13 | - [Enhanced In-Memory Text Vector Search in .NET with SharpVector and OpenAI Embeddings](https://build5nines.com/enhanced-in-memory-text-vector-search-in-net-with-sharpvector-and-openai-embeddings/?utm_source=github&utm_medium=sharpvector) by Chris Pietschmann 14 | - [Build a Generative AI + RAG App in C# with Phi-3, ONNX, and SharpVector](https://build5nines.com/build-a-generative-ai-rag-app-in-c-with-phi-3-onnx-and-sharpvector/?utm_source=github&utm_medium=sharpvector) by Chris Pietschmann 15 | - [Implementing Local RAG using Phi-3 ONNX Runtime and Sidecar Pattern on Linux App Service](https://azure.github.io/AppService/2024/09/03/Phi3-vector.html) by Tulika Chaudharie (Principal Product Manager at Microsoft for Azure App Service) 16 | - [Semantic Search
PDF Files Locally using .NET / C# and Build5Nines.SharpVector](https://build5nines.com/semantic-search-pdf-files-locally-using-c-and-build5nines-sharpvector/) by Chris Pietschmann 17 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/BasicDiskMemoryVectorDatabaseBase.cs: -------------------------------------------------------------------------------- 1 | using Build5Nines.SharpVector.Id; 2 | using Build5Nines.SharpVector.Preprocessing; 3 | using Build5Nines.SharpVector.Vocabulary; 4 | using Build5Nines.SharpVector.Vectorization; 5 | using Build5Nines.SharpVector.VectorCompare; 6 | using Build5Nines.SharpVector.VectorStore; 7 | 8 | namespace Build5Nines.SharpVector; 9 | 10 | /// 11 | /// Base class for an on-disk vector database. Mirrors MemoryVectorDatabaseBase generic composition 12 | /// while using disk-backed stores for persistence. 13 | /// 14 | public abstract class BasicDiskMemoryVectorDatabaseBase 15 | : VectorDatabaseBase 16 | where TId : notnull 17 | where TVocabularyKey : notnull 18 | where TVocabularyValue : notnull 19 | where TVectorStore : IVectorStoreWithVocabulary 20 | where TVocabularyStore : IVocabularyStore 21 | where TIdGenerator : IIdGenerator, new() 22 | where TTextPreprocessor : ITextPreprocessor, new() 23 | where TVectorizer : IVectorizer, new() 24 | where TVectorComparer : IVectorComparer, new() 25 | { 26 | protected BasicDiskMemoryVectorDatabaseBase(TVectorStore vectorStore) 27 | : base(vectorStore) 28 | { } 29 | } 30 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector.OpenAI/Build5Nines.SharpVector.OpenAI.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | net8.0 5 | enable 6 | enable 7 | true 8 | 9 | Build5Nines.SharpVector.OpenAI 10 | https://sharpvector.build5nines.com 11 | https://github.com/Build5Nines/SharpVector 12 | 2.0.4 13 | Lightweight In-memory Vector 
Database to embed in any .NET Applications that integrates with OpenAI Embedding model for vector generation. 14 | Copyright (c) 2025 Build5Nines LLC 15 | README.md 16 | LICENSE 17 | Chris Pietschmann 18 | Build5Nines LLC 19 | vector;search;database;data;rag;openai;embeddings;azure;microsoft; 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/Vectorization/IVectorizer.cs: -------------------------------------------------------------------------------- 1 | namespace Build5Nines.SharpVector.Vectorization; 2 | 3 | using Build5Nines.SharpVector.Vocabulary; 4 | 5 | /// 6 | /// An interface for classes that vectorize a collection of tokens 7 | /// 8 | /// 9 | /// 10 | public interface IVectorizer 11 | where TVocabularyKey : notnull 12 | where TVocabularyValue : notnull 13 | { 14 | /// 15 | /// Generates vectors from tokens using the vocabulary. 16 | /// 17 | /// The vocabulary store to use for vectorization 18 | /// The tokens to generate a vector from 19 | /// 20 | float[] GenerateVectorFromTokens(IVocabularyStore vocabularyStore, IEnumerable tokens); 21 | 22 | /// 23 | /// Generates vectors from tokens using the vocabulary asynchronously. 24 | /// 25 | /// 26 | /// 27 | /// 28 | Task GenerateVectorFromTokensAsync(IVocabularyStore vocabularyStore, IEnumerable tokens); 29 | 30 | /// 31 | /// Method to normalize vectors to a specific length by padding or truncating 32 | /// 33 | /// 34 | /// 35 | /// 36 | float[] NormalizeVector(float[] vector, TVocabularyValue length); 37 | } -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/VectorTextItem.cs: -------------------------------------------------------------------------------- 1 | namespace Build5Nines.SharpVector; 2 | 3 | /// 4 | /// An interface for storing a text with its metadata and vector data.
5 | /// 6 | /// 7 | /// 8 | public interface IVectorTextItem 9 | { 10 | TDocument Text { get; set; } 11 | TMetadata? Metadata { get; set; } 12 | float[] Vector { get; set; } 13 | } 14 | 15 | /// 16 | /// An interface for storing a text with its metadata and vector. 17 | /// 18 | /// 19 | public interface IVectorTextItem : IVectorTextItem 20 | { } 21 | 22 | /// 23 | /// A class for storing a text with its metadata and vector. 24 | /// 25 | /// 26 | /// 27 | public class VectorTextItem : IVectorTextItem 28 | { 29 | public VectorTextItem(TDocument text, TMetadata? metadata, float[] vector) 30 | { 31 | Text = text; 32 | Metadata = metadata; 33 | Vector = vector; 34 | } 35 | 36 | public TDocument Text { get; set; } 37 | public TMetadata? Metadata { get; set; } 38 | public float[] Vector { get; set; } 39 | } 40 | 41 | /// 42 | /// A class for storing a text with its metadata and vector data. 43 | /// 44 | /// 45 | public class VectorTextItem : VectorTextItem, IVectorTextItem 46 | { 47 | public VectorTextItem(string text, TMetadata? 
metadata, float[] vector) 48 | : base(text, metadata, vector) 49 | { } 50 | } -------------------------------------------------------------------------------- /src/SharpVectorTest/Regression/RegressionTests.cs: -------------------------------------------------------------------------------- 1 | namespace SharpVectorTest.Regression; 2 | 3 | using System.Diagnostics; 4 | using System.Threading.Tasks; 5 | using Build5Nines.SharpVector; 6 | using Build5Nines.SharpVector.Id; 7 | using Build5Nines.SharpVector.Preprocessing; 8 | using Build5Nines.SharpVector.VectorCompare; 9 | using Build5Nines.SharpVector.Vectorization; 10 | using Build5Nines.SharpVector.VectorStore; 11 | using Build5Nines.SharpVector.Vocabulary; 12 | 13 | [TestClass] 14 | public class RegressionTests 15 | { 16 | [TestMethod] 17 | public void VectorDatabaseVersion_2_0_2_001() 18 | { 19 | var vdb = new MemoryVectorDatabase(); 20 | 21 | vdb.LoadFromFile("Regression/regression-vector-database-v2.0.2.b59vdb"); 22 | 23 | var results = vdb.Search("Lion King"); 24 | 25 | Assert.AreEqual(1, results.Texts.Count()); 26 | Assert.IsTrue(results.Texts.First().Text.Contains("Lion King")); 27 | Assert.AreEqual("{ value: \"JSON Metadata Value\" }", results.Texts.First().Metadata); 28 | Assert.AreEqual(0.3396831452846527, results.Texts.First().Similarity); 29 | } 30 | 31 | [TestMethod] 32 | public async Task LoadVectorDatabaseInfo_2_0_2_001() 33 | { 34 | var file = new FileStream("Regression/regression-vector-database-v2.0.2.b59vdb", FileMode.Open, FileAccess.Read); 35 | var dbinfo = await DatabaseFile.LoadDatabaseInfoFromZipArchiveAsync(file); 36 | 37 | Assert.AreEqual("Build5Nines.SharpVector", dbinfo.Schema); 38 | Assert.AreEqual("1.0.0", dbinfo.Version); 39 | Assert.AreEqual("Build5Nines.SharpVector.MemoryVectorDatabase\u00601[[System.String, System.Private.CoreLib, Version=8.0.0.0, Culture=neutral, PublicKeyToken=7cec85d7bea7798e]]", dbinfo.ClassType); 40 | } 41 | } 
-------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/IVectorDatabaseExtensions.cs: -------------------------------------------------------------------------------- 1 | using System.Text; 2 | 3 | namespace Build5Nines.SharpVector; 4 | 5 | /// 6 | /// Save and Load extension methods for IVectorDatabase<> 7 | /// 8 | public static class IVectorDatabaseExtensions 9 | { 10 | public static async Task SaveToFileAsync(this IVectorDatabase vectorDatabase, string filePath) 11 | where TId : notnull 12 | { 13 | using (var stream = new FileStream(filePath, FileMode.Create, FileAccess.Write)) 14 | { 15 | await vectorDatabase.SerializeToBinaryStreamAsync(stream); 16 | } 17 | } 18 | 19 | public static void SaveToFile(this IVectorDatabase vectorDatabase, string filePath) 20 | where TId : notnull 21 | { 22 | using (var stream = new FileStream(filePath, FileMode.Create, FileAccess.Write)) 23 | { 24 | vectorDatabase.SerializeToBinaryStream(stream); 25 | } 26 | } 27 | 28 | public static async Task LoadFromFileAsync(this IVectorDatabase vectorDatabase, string filePath) 29 | where TId : notnull 30 | { 31 | using (var stream = new FileStream(filePath, FileMode.Open, FileAccess.Read)) 32 | { 33 | await vectorDatabase.DeserializeFromBinaryStreamAsync(stream); 34 | } 35 | } 36 | 37 | public static void LoadFromFile(this IVectorDatabase vectorDatabase, string filePath) 38 | where TId : notnull 39 | { 40 | using (var stream = new FileStream(filePath, FileMode.Open, FileAccess.Read)) 41 | { 42 | vectorDatabase.DeserializeFromBinaryStream(stream); 43 | } 44 | } 45 | } -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/BasicDiskVectorDatabase.cs: -------------------------------------------------------------------------------- 1 | using Build5Nines.SharpVector.Vocabulary; 2 | using Build5Nines.SharpVector.Id; 3 | using Build5Nines.SharpVector.Preprocessing; 4 | using
Build5Nines.SharpVector.Vectorization; 5 | using Build5Nines.SharpVector.VectorCompare; 6 | using Build5Nines.SharpVector.VectorStore; 7 | 8 | namespace Build5Nines.SharpVector; 9 | 10 | /// 11 | /// A basic disk-backed vector database using Bag-of-Words, Cosine similarity, 12 | /// disk-backed vector store and vocabulary store. Uses int IDs; metadata is of the generic type TMetadata. 13 | /// 14 | public class BasicDiskVectorDatabase 15 | : BasicDiskMemoryVectorDatabaseBase< 16 | int, 17 | TMetadata, 18 | BasicDiskVectorStore, string, int>, 19 | BasicDiskVocabularyStore, 20 | string, int, 21 | IntIdGenerator, 22 | BasicTextPreprocessor, 23 | BagOfWordsVectorizer, 24 | CosineSimilarityVectorComparer 25 | >, IMemoryVectorDatabase, IVectorDatabase 26 | { 27 | public BasicDiskVectorDatabase(string rootPath) 28 | : base( 29 | new BasicDiskVectorStore, string, int>( 30 | rootPath, 31 | new BasicDiskVocabularyStore(rootPath) 32 | ) 33 | ) 34 | { } 35 | 36 | [Obsolete("Use DeserializeFromBinaryStreamAsync instead.")] 37 | public override async Task DeserializeFromJsonStreamAsync(Stream stream) 38 | { 39 | await DeserializeFromBinaryStreamAsync(stream); 40 | } 41 | 42 | [Obsolete("Use DeserializeFromBinaryStream instead.")] 43 | public override void DeserializeFromJsonStream(Stream stream) 44 | { 45 | DeserializeFromBinaryStream(stream); 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector.Ollama/OllamaMemoryVectorDatabase.cs: -------------------------------------------------------------------------------- 1 | using Build5Nines.SharpVector.Id; 2 | using Build5Nines.SharpVector.VectorCompare; 3 | using Build5Nines.SharpVector.VectorStore; 4 | using Build5Nines.SharpVector.Embeddings; 5 | 6 | namespace Build5Nines.SharpVector.Ollama; 7 | 8 | /// 9 | /// An interface for a vector database that uses Ollama for embedding generation. 
10 | /// 11 | /// 12 | /// 13 | public interface IOllamaMemoryVectorDatabase : IVectorDatabase 14 | where TId : notnull 15 | { } 16 | 17 | /// 18 | /// A simple in-memory database for storing and querying vectorized text items. 19 | /// This database uses Ollama to generate embeddings, and performs Cosine similarity search. 20 | /// 21 | /// Defines the data type for the Metadata stored with the Text. 22 | public class OllamaMemoryVectorDatabase 23 | : MemoryVectorDatabaseBase< 24 | int, 25 | TMetadata, 26 | MemoryDictionaryVectorStore, 27 | IntIdGenerator, 28 | CosineSimilarityVectorComparer 29 | >, IOllamaMemoryVectorDatabase 30 | { 31 | public OllamaMemoryVectorDatabase(string model) 32 | : this( 33 | new Embeddings.OllamaEmbeddingsGenerator(model) 34 | ) 35 | { } 36 | 37 | public OllamaMemoryVectorDatabase(string ollamaEndpoint, string model) 38 | : this( 39 | new Embeddings.OllamaEmbeddingsGenerator(ollamaEndpoint, model) 40 | ) 41 | { } 42 | 43 | public OllamaMemoryVectorDatabase(IEmbeddingsGenerator embeddingsGenerator) 44 | : base( 45 | embeddingsGenerator, 46 | new MemoryDictionaryVectorStore() 47 | ) 48 | { } 49 | } 50 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector.OpenAI/Embeddings/OpenAIEmbeddingsGenerator.cs: -------------------------------------------------------------------------------- 1 | using Build5Nines.SharpVector.Embeddings; 2 | using OpenAI.Embeddings; 3 | using System.Collections.Generic; 4 | using System.Linq; 5 | 6 | namespace Build5Nines.SharpVector.OpenAI.Embeddings; 7 | 8 | public class OpenAIEmbeddingsGenerator : IBatchEmbeddingsGenerator 9 | { 10 | protected EmbeddingClient EmbeddingClient { get; private set; } 11 | 12 | public OpenAIEmbeddingsGenerator(EmbeddingClient embeddingClient) 13 | { 14 | EmbeddingClient = embeddingClient; 15 | } 16 | public async Task GenerateEmbeddingsAsync(string text) 17 | { 18 | var result = await
EmbeddingClient.GenerateEmbeddingAsync(text); 19 | var embedding = result.Value; 20 | var vector = embedding.ToFloats(); 21 | return vector.ToArray(); 22 | } 23 | 24 | /// 25 | /// Generates embeddings for a batch of input texts using the OpenAI embeddings client. 26 | /// This leverages the API's multi-input batching for improved throughput and reduced overhead. 27 | /// 28 | /// Collection of non-empty texts to embed. 29 | /// A list of float vectors aligned to the input order. 30 | public async Task> GenerateEmbeddingsAsync(IEnumerable texts) 31 | { 32 | if (texts is null) throw new ArgumentNullException(nameof(texts)); 33 | 34 | var inputs = texts.ToList(); 35 | if (inputs.Count == 0) 36 | { 37 | return Array.Empty(); 38 | } 39 | 40 | // Call the batch embeddings API once for all inputs. 41 | var batchResult = await EmbeddingClient.GenerateEmbeddingsAsync(inputs); 42 | 43 | // Map the embeddings to float arrays while preserving order. 44 | var vectors = batchResult.Value.Select(e => e.ToFloats().ToArray()).ToList(); 45 | 46 | return vectors; 47 | } 48 | } -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/VectorCompare/IVectorComparer.cs: -------------------------------------------------------------------------------- 1 | namespace Build5Nines.SharpVector.VectorCompare; 2 | 3 | public interface IVectorComparer 4 | { 5 | /// 6 | /// Calculates a comparison between two vectors 7 | /// 8 | /// 9 | /// 10 | /// 11 | /// 12 | float Calculate(float[] vectorA, float[] vectorB); 13 | 14 | /// 15 | /// Sorts the results of a comparison 16 | /// 17 | /// 18 | /// 19 | /// 20 | IEnumerable> Sort(IEnumerable> results); 21 | 22 | /// 23 | /// Determines if the comparison value is within the given threshold 24 | /// 25 | /// 26 | /// 27 | /// 28 | bool IsWithinThreshold(float? 
threshold, float vectorComparisonValue); 29 | 30 | /// 31 | /// Calculates a comparison between two vectors asynchronously 32 | /// 33 | /// 34 | /// 35 | /// 36 | /// 37 | Task CalculateAsync(float[] vectorA, float[] vectorB); 38 | 39 | /// 40 | /// Sorts the results of a comparison asynchronously 41 | /// 42 | /// 43 | /// 44 | /// 45 | Task>> SortAsync(IEnumerable> results); 46 | } 47 | 48 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/Vocabulary/DictionaryVocabularyStore.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Concurrent; 2 | using System.Text.Json; 3 | 4 | namespace Build5Nines.SharpVector.Vocabulary; 5 | 6 | /// 7 | /// A simple in-memory vocabulary store, backed by a ConcurrentDictionary, that maps each token to an integer index. 8 | /// 9 | /// 10 | public class DictionaryVocabularyStore : IVocabularyStore 11 | where TKey : notnull 12 | { 13 | private ConcurrentDictionary _vocabulary; 14 | 15 | public DictionaryVocabularyStore() 16 | { 17 | _vocabulary = new ConcurrentDictionary(); 18 | } 19 | 20 | private readonly object _lock = new object(); 21 | 22 | public void Update(IEnumerable tokens) 23 | { 24 | lock(_lock) { 25 | foreach (var token in tokens) 26 | { 27 | // TryAdd leaves tokens that are already known untouched, so their 28 | // indexes never change; an unseen token receives the current 29 | // vocabulary size as its index (single lookup instead of ContainsKey + indexer). 30 | _vocabulary.TryAdd(token, Count); 31 | } 32 | } 33 | } 34 | 35 | public async Task UpdateAsync(IEnumerable tokens) 36 | { 37 | await Task.Run(() => Update(tokens)); 38 | } 39 | 40 | public int Count { get => _vocabulary.Count; } 41 | 42 | public bool TryGetValue(TKey token, out int index) 43 | { 44 | return _vocabulary.TryGetValue(token, out index); 45 | } 46 | 47 | public async Task SerializeToJsonStreamAsync(Stream stream) 48 | { 49 | if (stream == null) 50 | { 51 | throw new ArgumentNullException(nameof(stream)); 52 | } 53 | await JsonSerializer.SerializeAsync>(stream, _vocabulary); 54 | } 55 | 56 | public async Task 
DeserializeFromJsonStreamAsync(Stream stream) 57 | { 58 | if (stream == null) 59 | { 60 | throw new ArgumentNullException(nameof(stream)); 61 | } 62 | 63 | this._vocabulary = await JsonSerializer.DeserializeAsync>(stream) ?? new ConcurrentDictionary(); 64 | } 65 | } -------------------------------------------------------------------------------- /src/SharpVectorPerformance/DiskVectorDatabasePerformance.cs: -------------------------------------------------------------------------------- 1 | namespace SharpVectorPerformance; 2 | 3 | using System.Diagnostics; 4 | using Build5Nines.SharpVector; 5 | using Build5Nines.SharpVector.Id; 6 | using Build5Nines.SharpVector.Preprocessing; 7 | using Build5Nines.SharpVector.VectorCompare; 8 | using Build5Nines.SharpVector.Vectorization; 9 | using Build5Nines.SharpVector.VectorStore; 10 | using Build5Nines.SharpVector.Vocabulary; 11 | using BenchmarkDotNet.Attributes; 12 | using BenchmarkDotNet.Running; 13 | 14 | [MemoryDiagnoser] 15 | public class DiskVectorDatabasePerformance 16 | { 17 | private BasicDiskVectorDatabase? 
_db; 18 | private string _rootPath = Path.Combine(Path.GetTempPath(), "SharpVectorPerf", Guid.NewGuid().ToString("N")); 19 | 20 | [GlobalSetup] 21 | public void Setup() 22 | { 23 | Directory.CreateDirectory(_rootPath); 24 | _db = new BasicDiskVectorDatabase(_rootPath); 25 | } 26 | 27 | [GlobalCleanup] 28 | public void Cleanup() 29 | { 30 | try { if (Directory.Exists(_rootPath)) Directory.Delete(_rootPath, recursive: true); } catch { /* best-effort removal of the temp directory; a failure here must not fail the benchmark run */ } 31 | } 32 | 33 | [Params(25)] 34 | public int ItemCount; 35 | 36 | [Benchmark] 37 | public async Task AddTexts() 38 | { 39 | var indices = Enumerable.Range(0, ItemCount); 40 | await Parallel.ForEachAsync(indices, async (i, ct) => 41 | { 42 | var text = $"Sample text {i} fox {Random.Shared.Next(0, 100)}"; 43 | await _db!.AddTextAsync(text, "meta"); 44 | }); 45 | } 46 | 47 | [Benchmark] 48 | public async Task Search() 49 | { 50 | // Ensure some data 51 | if (!_db!.GetIds().Any()) 52 | { 53 | var indices = Enumerable.Range(0, 500); 54 | await Parallel.ForEachAsync(indices, async (i, ct) => 55 | { 56 | await _db.AddTextAsync($"quick brown fox {i}", null); 57 | }); 58 | } 59 | var results = await _db.SearchAsync("quick fox"); 60 | // Touch results to avoid dead-code elimination 61 | _ = results.Texts.Take(10).Count(); 62 | } 63 | 64 | [Benchmark] 65 | public void DeleteIds() 66 | { 67 | // Take(50) already yields fewer items when fewer exist, so the ids are enumerated once instead of twice. 68 | var ids = _db!.GetIds().Take(50).ToList(); 69 | foreach (var id in ids) 70 | { 71 | _db.DeleteText(id); 72 | } 73 | } 74 | } -------------------------------------------------------------------------------- /.github/workflows/build-release.yml: -------------------------------------------------------------------------------- 1 | name: Build Release 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | paths-ignore: 8 | - 'docs/**' 9 | - mkdocs.yml 10 | workflow_dispatch: 11 | 12 | jobs: 13 | build: 14 | runs-on: ubuntu-latest 15 | defaults: 16 | run: 17 | working-directory: src 18 | 19 | steps: 20 | - name: Checkout code 21 | 
uses: actions/checkout@v2 22 | 23 | - name: Setup .NET Core 24 | uses: actions/setup-dotnet@v2 25 | with: 26 | dotnet-version: '8.0.x' # Adjust the version as needed 27 | 28 | - name: Restore dependencies 29 | run: dotnet restore 30 | 31 | - name: Build 32 | run: dotnet build --configuration Release --no-restore 33 | 34 | - name: Tests 35 | run: dotnet test --configuration Release --no-build 36 | 37 | # - name: Run tests with code coverage 38 | # run: dotnet test --no-build --verbosity normal --results-directory "./TestResults/Coverage/" --collect:"XPlat Code Coverage" 39 | 40 | # - name: Upload test results artifact 41 | # uses: actions/upload-artifact@v4 42 | # with: 43 | # name: test-results 44 | # path: '**/TestResults/**' 45 | 46 | - name: Performance Test 47 | run: dotnet run --project SharpVectorPerformance --configuration Release 48 | 49 | # - name: Publish 50 | # run: dotnet publish --configuration Release --output ./publish --no-build 51 | 52 | # - name: Upload artifact 53 | # uses: actions/upload-artifact@v4 54 | # with: 55 | # name: release-build 56 | # path: ./publish 57 | 58 | - name: Performance Results 59 | run: | 60 | echo "## Performance Results" > $GITHUB_STEP_SUMMARY 61 | cat ./BenchmarkDotNet.Artifacts/results/SharpVectorPerformance.MemoryVectorDatabasePerformance-report-github.md >> $GITHUB_STEP_SUMMARY 62 | 63 | - name: Upload Performance artifact 64 | uses: actions/upload-artifact@v4 65 | with: 66 | name: performance-results 67 | path: './src/BenchmarkDotNet.Artifacts/*' 68 | 69 | - name: Upload Nuget artifact 70 | uses: actions/upload-artifact@v4 71 | with: 72 | name: nuget-package 73 | path: '**/*.nupkg' -------------------------------------------------------------------------------- /.github/workflows/ghpages-mkdocs.yml: -------------------------------------------------------------------------------- 1 | name: Build MKDocs Site 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | - dev 8 | paths: 9 | - 
.github/workflows/ghpages-mkdocs.yml 10 | - docs/** 11 | - mkdocs.yml 12 | # NOTE: `paths` and `paths-ignore` cannot be combined on the same event; the `paths` 13 | # list above is the only filter (a former `paths-ignore: .github/**` entry was removed). 14 | workflow_dispatch: 15 | 16 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 17 | permissions: 18 | contents: read 19 | pages: write 20 | id-token: write 21 | 22 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 23 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 24 | concurrency: 25 | group: "pages" 26 | cancel-in-progress: false 27 | 28 | jobs: 29 | build: 30 | runs-on: ubuntu-latest 31 | 32 | steps: 33 | - name: Checkout code 34 | uses: actions/checkout@v3 35 | with: 36 | fetch-depth: 0 # Fetch all history for all branches and tags, not just the default branch. 37 | # This is needed to ensure that the commit SHA is available for the deployment. 38 | # See 39 | sparse-checkout: | 40 | docs 41 | mkdocs.yml 42 | .github/workflows/ghpages-mkdocs.yml 43 | 44 | - name: Setup pages 45 | id: pages 46 | uses: actions/configure-pages@v5 47 | 48 | 49 | - name: Set up Python 50 | uses: actions/setup-python@v4 51 | with: 52 | python-version: '3.x' # Use the latest version of Python 3 53 | 54 | - name: Install dependencies 55 | run: | 56 | python -m pip install --upgrade pip 57 | pip install -r ./docs/requirements.txt 58 | 59 | - name: Build documentation 60 | # Outputs to the './_site' directory by default 61 | run: | 62 | mkdocs build --site-dir ./_site --config-file ./docs/mkdocs.yml 63 | 64 | - name: Upload artifact 65 | # Automatically upload an artifact from the './_site' directory by default 66 | uses: actions/upload-pages-artifact@v3 67 | with: 68 | path: ./docs/_site 69 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector.Playground/Components/Layout/MainLayout.razor.css: -------------------------------------------------------------------------------- 1 | 
.page { 2 | position: relative; 3 | display: flex; 4 | flex-direction: column; 5 | } 6 | 7 | main { 8 | flex: 1; 9 | } 10 | 11 | .sidebar { 12 | background-image: linear-gradient(180deg, rgb(5, 39, 103) 0%, #3a0647 70%); 13 | } 14 | 15 | .top-row { 16 | background-color: #f7f7f7; 17 | border-bottom: 0.1em solid #d6d5d5; 18 | padding-left: 1em; 19 | height: 3.5rem; 20 | display: flex; 21 | align-items: center; 22 | } 23 | 24 | .top-row ::deep a, .top-row ::deep .btn-link { 25 | white-space: nowrap; 26 | margin-left: 1.5rem; 27 | text-decoration: none; 28 | } 29 | 30 | .top-row ::deep a:hover, .top-row ::deep .btn-link:hover { 31 | text-decoration: underline; 32 | } 33 | 34 | .top-row ::deep a:first-child { 35 | overflow: hidden; 36 | text-overflow: ellipsis; 37 | } 38 | 39 | @media (max-width: 640.98px) { 40 | .top-row { 41 | justify-content: space-between; 42 | } 43 | 44 | .top-row ::deep a, .top-row ::deep .btn-link { 45 | margin-left: 0; 46 | } 47 | } 48 | 49 | @media (min-width: 641px) { 50 | .page { 51 | flex-direction: row; 52 | } 53 | 54 | .sidebar { 55 | width: 250px; 56 | height: 100vh; 57 | position: sticky; 58 | top: 0; 59 | } 60 | 61 | .top-row { 62 | position: sticky; 63 | top: 0; 64 | z-index: 1; 65 | } 66 | 67 | .top-row.auth ::deep a:first-child { 68 | flex: 1; 69 | text-align: right; 70 | width: 0; 71 | } 72 | 73 | .top-row, article { 74 | padding-left: 2rem !important; 75 | padding-right: 1.5rem !important; 76 | } 77 | } 78 | 79 | #blazor-error-ui { 80 | background: lightyellow; 81 | bottom: 0; 82 | box-shadow: 0 -1px 2px rgba(0, 0, 0, 0.2); 83 | display: none; 84 | left: 0; 85 | padding: 0.6rem 1.25rem 0.7rem 1.25rem; 86 | position: fixed; 87 | width: 100%; 88 | z-index: 1000; 89 | } 90 | 91 | #blazor-error-ui .dismiss { 92 | cursor: pointer; 93 | position: absolute; 94 | right: 0.75rem; 95 | top: 0.5rem; 96 | } 97 | -------------------------------------------------------------------------------- 
/src/Build5Nines.SharpVector/DatabaseFileException.cs: -------------------------------------------------------------------------------- 1 | namespace Build5Nines.SharpVector; 2 | 3 | public class DatabaseFileException : Exception 4 | { 5 | public DatabaseFileException() 6 | { 7 | } 8 | 9 | public DatabaseFileException(string message) 10 | : base(message) 11 | { 12 | } 13 | 14 | public DatabaseFileException(string message, Exception innerException) 15 | : base(message, innerException) 16 | { 17 | } 18 | } 19 | 20 | public class DatabaseFileInfoException : DatabaseFileException 21 | { 22 | public DatabaseFileInfoException() 23 | { 24 | } 25 | 26 | public DatabaseFileInfoException(string message) 27 | : base(message) 28 | { 29 | } 30 | 31 | public DatabaseFileInfoException(string message, Exception innerException) 32 | : base(message, innerException) 33 | { 34 | } 35 | } 36 | 37 | public class DatabaseFileSchemaException : DatabaseFileException 38 | { 39 | public DatabaseFileSchemaException() 40 | { 41 | } 42 | 43 | public DatabaseFileSchemaException(string message) 44 | : base(message) 45 | { 46 | } 47 | 48 | public DatabaseFileSchemaException(string message, Exception innerException) 49 | : base(message, innerException) 50 | { 51 | } 52 | } 53 | 54 | public class DatabaseFileVersionException : DatabaseFileException 55 | { 56 | public DatabaseFileVersionException() 57 | { 58 | } 59 | 60 | public DatabaseFileVersionException(string message) 61 | : base(message) 62 | { 63 | } 64 | 65 | public DatabaseFileVersionException(string message, Exception innerException) 66 | : base(message, innerException) 67 | { 68 | } 69 | } 70 | 71 | public class DatabaseFileClassTypeException : DatabaseFileException 72 | { 73 | public DatabaseFileClassTypeException() 74 | { 75 | } 76 | 77 | public DatabaseFileClassTypeException(string message) 78 | : base(message) 79 | { 80 | } 81 | 82 | public DatabaseFileClassTypeException(string message, Exception innerException) 83 | : base(message, 
innerException) 84 | { 85 | } 86 | } 87 | 88 | public class DatabaseFileMissingEntryException : DatabaseFileException 89 | { 90 | public DatabaseFileMissingEntryException(string message, string missingEntry) 91 | : base(message) 92 | { 93 | MissingEntry = missingEntry; 94 | } 95 | 96 | public string MissingEntry { get; private set; } 97 | } -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/VectorCompare/EuclideanDistanceVectorComparerAsync.cs: -------------------------------------------------------------------------------- 1 | using System.Runtime.InteropServices; 2 | 3 | namespace Build5Nines.SharpVector.VectorCompare; 4 | 5 | public class EuclideanDistanceVectorComparer : IVectorComparer 6 | { 7 | /// 8 | /// Calculates the Euclidean distance between two vectors. 9 | /// 10 | /// 11 | /// 12 | /// 13 | /// 14 | public async Task CalculateAsync(float[] vectorA, float[] vectorB) 15 | { 16 | return await Task.Run(() => Calculate(vectorA, vectorB)); 17 | } 18 | 19 | /// 20 | /// Calculates the Euclidean distance between two vectors. 21 | /// 22 | /// 23 | /// 24 | /// 25 | /// 26 | public float Calculate(float[] vectorA, float[] vectorB) 27 | { 28 | if (vectorA.Length != vectorB.Length) 29 | { 30 | throw new ArgumentException("Vectors must be of the same length."); 31 | } 32 | 33 | float sumOfSquares = 0f; 34 | 35 | for (int i = 0; i < vectorA.Length; i++) 36 | { 37 | float difference = vectorA[i] - vectorB[i]; 38 | sumOfSquares += difference * difference; 39 | } 40 | 41 | return (float)Math.Sqrt(sumOfSquares); 42 | } 43 | 44 | public IEnumerable> Sort(IEnumerable> results) 45 | { 46 | return results.OrderBy(s => s.Similarity); 47 | } 48 | 49 | public async Task>> SortAsync(IEnumerable> results) 50 | { 51 | return await Task.Run(() => Sort(results)); 52 | } 53 | 54 | public bool IsWithinThreshold(float? 
threshold, float vectorComparisonValue) 55 | { 56 | if (threshold == null) 57 | { 58 | return true; 59 | } 60 | var thresholdToCompare = threshold ?? (float)0.0f; 61 | var thresholdIsEqual = Math.Abs(vectorComparisonValue - thresholdToCompare) < 1e-6f; // epsilon; 62 | return thresholdIsEqual || vectorComparisonValue < thresholdToCompare; 63 | } 64 | } -------------------------------------------------------------------------------- /.github/workflows/mkdocs-build.yml: -------------------------------------------------------------------------------- 1 | name: Deploy MKDocs Site to Pages 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | paths: 8 | - .github/workflows/ghpages-mkdocs.yml 9 | - docs/** 10 | - mkdocs.yml 11 | workflow_dispatch: 12 | 13 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 14 | permissions: 15 | contents: read 16 | pages: write 17 | id-token: write 18 | 19 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 20 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 21 | concurrency: 22 | group: "pages" 23 | cancel-in-progress: false 24 | 25 | jobs: 26 | build: 27 | runs-on: ubuntu-latest 28 | 29 | steps: 30 | - name: Checkout code 31 | uses: actions/checkout@v3 32 | with: 33 | fetch-depth: 0 # Fetch all history for all branches and tags, not just the default branch. 34 | # This is needed to ensure that the commit SHA is available for the deployment. 
35 | # See 36 | sparse-checkout: | 37 | docs 38 | mkdocs.yml 39 | .github/workflows/ghpages-mkdocs.yml 40 | 41 | - name: Setup pages 42 | id: pages 43 | uses: actions/configure-pages@v5 44 | 45 | 46 | - name: Set up Python 47 | uses: actions/setup-python@v4 48 | with: 49 | python-version: '3.x' # Use the latest version of Python 3 50 | 51 | - name: Install dependencies 52 | run: | 53 | python -m pip install --upgrade pip 54 | pip install -r ./docs/requirements.txt 55 | 56 | - name: Build documentation 57 | # Outputs to the './_site' directory by default 58 | run: | 59 | mkdocs build --site-dir ./_site --config-file ./docs/mkdocs.yml 60 | 61 | - name: Upload artifact 62 | # Automatically upload an artifact from the './_site' directory by default 63 | uses: actions/upload-pages-artifact@v3 64 | with: 65 | path: ./docs/_site 66 | 67 | deploy: 68 | runs-on: ubuntu-latest 69 | needs: build 70 | environment: 71 | name: github-pages 72 | url: ${{ steps.deployment.outputs.page_url }} 73 | steps: 74 | - name: Deploy to GitHub Pages 75 | id: deployment 76 | uses: actions/deploy-pages@v4 77 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector.Ollama/Embeddings/OllamaEmbeddingsGenerator.cs: -------------------------------------------------------------------------------- 1 | using System.Text; 2 | using System.Text.Json; 3 | using System.Text.Json.Serialization; 4 | using Build5Nines.SharpVector.Embeddings; 5 | 6 | namespace Build5Nines.SharpVector.Ollama.Embeddings; 7 | 8 | public class OllamaEmbeddingsGenerator : IEmbeddingsGenerator 9 | { 10 | public string Model { get; set; } 11 | 12 | public string Endpoint { get; set; } 13 | 14 | /// 15 | /// Creates a new instance of the class. 16 | /// This constructor uses the default Ollama embeddings endpoint URL. 
17 | /// 18 | /// Ollama embeddings model 19 | public OllamaEmbeddingsGenerator(string model) 20 | : this("http://localhost:11434/api/embeddings", model) 21 | { } 22 | 23 | /// 24 | /// Creates a new instance of the class. 25 | /// 26 | /// Ollama embeddings endpoint URL. 27 | /// Ollama embeddings model 28 | public OllamaEmbeddingsGenerator(string ollamaEndpoint, string model) 29 | { 30 | Endpoint = ollamaEndpoint; 31 | Model = model; 32 | } 33 | private static readonly HttpClient _httpClient = new HttpClient(); // One client shared by every call: creating an HttpClient per request leaves sockets behind and can exhaust them under load. 34 | /// <summary> 35 | /// Generates embeddings for the given text using the specified Ollama model. 36 | /// </summary> 37 | /// <param name="text">The text to generate embeddings for.</param> 38 | /// <returns>An array of floats representing the generated embeddings; empty when the response carries no embedding.</returns> 39 | public async Task<float[]> GenerateEmbeddingsAsync(string text) 40 | { 41 | var requestBody = new 42 | { 43 | model = Model, 44 | prompt = text 45 | }; 46 | 47 | var json = JsonSerializer.Serialize(requestBody); 48 | using var content = new StringContent(json, Encoding.UTF8, "application/json"); 49 | 50 | // The response is disposed at the end of the method, after its body has been read. 51 | using var response = await _httpClient.PostAsync(Endpoint, content); 52 | response.EnsureSuccessStatusCode(); 53 | 54 | var responseString = await response.Content.ReadAsStringAsync(); 55 | var embeddingResponse = JsonSerializer.Deserialize<OllamaEmbeddingResponse>(responseString); 56 | 57 | return embeddingResponse?.Embedding ?? Array.Empty<float>(); 58 | } 59 | 60 | private class OllamaEmbeddingResponse 61 | { 62 | [JsonPropertyName("embedding")] 63 | public float[]? 
Embedding { get; set; } 64 | } 65 | } -------------------------------------------------------------------------------- /src/SharpVectorTest/BatchAddTests.cs: -------------------------------------------------------------------------------- 1 | namespace SharpVectorTest; 2 | 3 | using System.Linq; 4 | using System.Threading.Tasks; 5 | using Build5Nines.SharpVector; 6 | using Build5Nines.SharpVector.Embeddings; 7 | using Build5Nines.SharpVector.Id; 8 | using Build5Nines.SharpVector.VectorCompare; 9 | using Build5Nines.SharpVector.VectorStore; 10 | 11 | [TestClass] 12 | public class BatchAddTests 13 | { 14 | [TestMethod] 15 | public async Task AddTextsAsync_UsesBatchEmbeddings_WhenAvailable() 16 | { 17 | var db = new BatchMockMemoryVectorDatabase(); 18 | 19 | var inputs = new (string text, string? metadata)[] 20 | { 21 | ("one", "m1"), 22 | ("two", "m2"), 23 | ("three", "m3") 24 | }; 25 | 26 | var ids = await db.AddTextsAsync(inputs); 27 | 28 | Assert.AreEqual(3, ids.Count); 29 | 30 | var results = db.Search("one"); 31 | Assert.AreEqual(3, results.Texts.Count()); 32 | 33 | // Ensure vectors were assigned from batch generator (length = 5 per mock) 34 | foreach (var item in db) 35 | { 36 | Assert.AreEqual(5, item.Vector.Length); 37 | } 38 | } 39 | } 40 | 41 | public class BatchMockMemoryVectorDatabase 42 | : MemoryVectorDatabaseBase< 43 | int, 44 | string, 45 | MemoryDictionaryVectorStore, 46 | IntIdGenerator, 47 | CosineSimilarityVectorComparer 48 | > 49 | { 50 | public BatchMockMemoryVectorDatabase() 51 | : base( 52 | new MockBatchEmbeddingsGenerator(), 53 | new MemoryDictionaryVectorStore() 54 | ) 55 | { } 56 | } 57 | 58 | public class MockBatchEmbeddingsGenerator : IEmbeddingsGenerator, IBatchEmbeddingsGenerator 59 | { 60 | #pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously 61 | public async Task GenerateEmbeddingsAsync(string text) 62 | #pragma warning restore CS1998 // Async method lacks 'await' operators and will run 
synchronously 63 | { 64 | return new float[] { 0.1f, 0.2f, 0.3f, 0.4f, 0.5f }; 65 | } 66 | 67 | #pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously 68 | public async Task> GenerateEmbeddingsAsync(IEnumerable texts) 69 | #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously 70 | { 71 | // Return a different first value to ensure we can recognize batched path if needed 72 | return texts.Select((t, idx) => new float[] { 0.9f, 0.2f, 0.3f, 0.4f, 0.5f }).ToList(); 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/SharpVectorTest/DiskVectorDatabaseTests.cs: -------------------------------------------------------------------------------- 1 | namespace SharpVectorTest; 2 | 3 | using System; 4 | using System.IO; 5 | using System.Linq; 6 | using System.Threading.Tasks; 7 | using Build5Nines.SharpVector; 8 | using Microsoft.VisualStudio.TestTools.UnitTesting; 9 | 10 | [TestClass] 11 | public class DiskVectorDatabaseTests 12 | { 13 | private static string CreateTempDir() 14 | { 15 | var dir = Path.Combine(Path.GetTempPath(), "SharpVectorTests", Guid.NewGuid().ToString("N")); 16 | Directory.CreateDirectory(dir); 17 | return dir; 18 | } 19 | 20 | [TestMethod] 21 | public async Task AddAndGetText_PersistsToDisk() 22 | { 23 | var root = CreateTempDir(); 24 | var db = new BasicDiskVectorDatabase(root); 25 | 26 | var id = await db.AddTextAsync("hello world", "meta1"); 27 | var item = db.GetText(id); 28 | Assert.AreEqual("hello world", item.Text); 29 | Assert.AreEqual("meta1", item.Metadata); 30 | 31 | // Recreate DB and ensure data is still there 32 | var db2 = new BasicDiskVectorDatabase(root); 33 | var item2 = db2.GetText(id); 34 | Assert.AreEqual("hello world", item2.Text); 35 | Assert.AreEqual("meta1", item2.Metadata); 36 | } 37 | 38 | [TestMethod] 39 | public async Task Search_ReturnsSimilarResults() 40 | { 41 | var root = 
CreateTempDir(); 42 | var db = new BasicDiskVectorDatabase(root); 43 | 44 | await db.AddTextAsync("The quick brown fox", "a"); 45 | await db.AddTextAsync("Jumps over the lazy dog", "b"); 46 | await db.AddTextAsync("An unrelated sentence", "c"); 47 | 48 | var results = await db.SearchAsync("quick fox", threshold: null, pageIndex: 0, pageCount: null); 49 | Assert.IsTrue(results.Texts.Any()); 50 | Assert.IsTrue(results.Texts.Any(r => r.Text.Contains("quick", StringComparison.OrdinalIgnoreCase))); 51 | } 52 | 53 | [TestMethod] 54 | public async Task Delete_RemovesFromIndexButKeepsFile() 55 | { 56 | var root = CreateTempDir(); 57 | var db = new BasicDiskVectorDatabase(root); 58 | var id = await db.AddTextAsync("to be deleted", "m"); 59 | var existing = db.GetText(id); 60 | Assert.AreEqual("to be deleted", existing.Text); 61 | 62 | db.DeleteText(id); 63 | Assert.IsFalse(db.GetIds().Contains(id)); 64 | Assert.ThrowsException(() => db.GetText(id)); 65 | 66 | // Reopen and ensure deletion persists 67 | var db2 = new BasicDiskVectorDatabase(root); 68 | Assert.IsFalse(db2.GetIds().Contains(id)); 69 | Assert.ThrowsException(() => db2.GetText(id)); 70 | } 71 | } -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector.Playground/wwwroot/app.css: -------------------------------------------------------------------------------- 1 | html, body { 2 | font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; 3 | } 4 | 5 | a, .btn-link { 6 | color: #006bb7; 7 | } 8 | 9 | .btn-primary { 10 | color: #fff; 11 | background-color: #1b6ec2; 12 | border-color: #1861ac; 13 | } 14 | 15 | .btn:focus, .btn:active:focus, .btn-link.nav-link:focus, .form-control:focus, .form-check-input:focus { 16 | box-shadow: 0 0 0 0.1rem white, 0 0 0 0.25rem #258cfb; 17 | } 18 | 19 | .content { 20 | padding-top: 1.1rem; 21 | } 22 | 23 | h1:focus { 24 | outline: none; 25 | } 26 | 27 | .valid.modified:not([type=checkbox]) { 28 | outline: 1px solid #26b050; 29 
| } 30 | 31 | .invalid { 32 | outline: 1px solid #e50000; 33 | } 34 | 35 | .validation-message { 36 | color: #e50000; 37 | } 38 | 39 | .blazor-error-boundary { 40 | background: url(data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iNTYiIGhlaWdodD0iNDkiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyIgeG1sbnM6eGxpbms9Imh0dHA6Ly93d3cudzMub3JnLzE5OTkveGxpbmsiIG92ZXJmbG93PSJoaWRkZW4iPjxkZWZzPjxjbGlwUGF0aCBpZD0iY2xpcDAiPjxyZWN0IHg9IjIzNSIgeT0iNTEiIHdpZHRoPSI1NiIgaGVpZ2h0PSI0OSIvPjwvY2xpcFBhdGg+PC9kZWZzPjxnIGNsaXAtcGF0aD0idXJsKCNjbGlwMCkiIHRyYW5zZm9ybT0idHJhbnNsYXRlKC0yMzUgLTUxKSI+PHBhdGggZD0iTTI2My41MDYgNTFDMjY0LjcxNyA1MSAyNjUuODEzIDUxLjQ4MzcgMjY2LjYwNiA1Mi4yNjU4TDI2Ny4wNTIgNTIuNzk4NyAyNjcuNTM5IDUzLjYyODMgMjkwLjE4NSA5Mi4xODMxIDI5MC41NDUgOTIuNzk1IDI5MC42NTYgOTIuOTk2QzI5MC44NzcgOTMuNTEzIDI5MSA5NC4wODE1IDI5MSA5NC42NzgyIDI5MSA5Ny4wNjUxIDI4OS4wMzggOTkgMjg2LjYxNyA5OUwyNDAuMzgzIDk5QzIzNy45NjMgOTkgMjM2IDk3LjA2NTEgMjM2IDk0LjY3ODIgMjM2IDk0LjM3OTkgMjM2LjAzMSA5NC4wODg2IDIzNi4wODkgOTMuODA3MkwyMzYuMzM4IDkzLjAxNjIgMjM2Ljg1OCA5Mi4xMzE0IDI1OS40NzMgNTMuNjI5NCAyNTkuOTYxIDUyLjc5ODUgMjYwLjQwNyA1Mi4yNjU4QzI2MS4yIDUxLjQ4MzcgMjYyLjI5NiA1MSAyNjMuNTA2IDUxWk0yNjMuNTg2IDY2LjAxODNDMjYwLjczNyA2Ni4wMTgzIDI1OS4zMTMgNjcuMTI0NSAyNTkuMzEzIDY5LjMzNyAyNTkuMzEzIDY5LjYxMDIgMjU5LjMzMiA2OS44NjA4IDI1OS4zNzEgNzAuMDg4N0wyNjEuNzk1IDg0LjAxNjEgMjY1LjM4IDg0LjAxNjEgMjY3LjgyMSA2OS43NDc1QzI2Ny44NiA2OS43MzA5IDI2Ny44NzkgNjkuNTg3NyAyNjcuODc5IDY5LjMxNzkgMjY3Ljg3OSA2Ny4xMTgyIDI2Ni40NDggNjYuMDE4MyAyNjMuNTg2IDY2LjAxODNaTTI2My41NzYgODYuMDU0N0MyNjEuMDQ5IDg2LjA1NDcgMjU5Ljc4NiA4Ny4zMDA1IDI1OS43ODYgODkuNzkyMSAyNTkuNzg2IDkyLjI4MzcgMjYxLjA0OSA5My41Mjk1IDI2My41NzYgOTMuNTI5NSAyNjYuMTE2IDkzLjUyOTUgMjY3LjM4NyA5Mi4yODM3IDI2Ny4zODcgODkuNzkyMSAyNjcuMzg3IDg3LjMwMDUgMjY2LjExNiA4Ni4wNTQ3IDI2My41NzYgODYuMDU0N1oiIGZpbGw9IiNGRkU1MDAiIGZpbGwtcnVsZT0iZXZlbm9kZCIvPjwvZz48L3N2Zz4=) no-repeat 1rem/1.8rem, #b32121; 41 | padding: 1rem 1rem 1rem 3.7rem; 42 | color: white; 43 | } 44 | 45 | .blazor-error-boundary::after { 46 | content: "An error has occurred." 
47 | } 48 | 49 | .darker-border-checkbox.form-check-input { 50 | border-color: #929292; 51 | } 52 | 53 | 54 | #playgroundEditor { 55 | height: 10em; 56 | } -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/VectorStore/IVectorStore.cs: -------------------------------------------------------------------------------- 1 | using System.Collections; 2 | using System.Runtime.Serialization; 3 | 4 | namespace Build5Nines.SharpVector.VectorStore; 5 | 6 | /// 7 | /// Interface for a vector store. 8 | /// 9 | /// 10 | /// 11 | /// 12 | public interface IVectorStore 13 | : IEnumerable>>, 14 | IReadOnlyCollection>>, 15 | IEnumerable, 16 | IAsyncEnumerable>> 17 | { 18 | /// 19 | /// Retrieves a text and metadata by its ID 20 | /// 21 | /// 22 | /// 23 | /// 24 | IVectorTextItem Get(TId id); 25 | 26 | /// 27 | /// Gets all the Ids for every text. 28 | /// 29 | /// 30 | public IEnumerable GetIds(); 31 | 32 | /// 33 | /// Retrieves a text and metadata by its ID 34 | /// 35 | /// 36 | /// 37 | /// 38 | void Set(TId id, VectorTextItem item); 39 | 40 | /// 41 | /// Retrieves a text and metadata by its ID asynchronously 42 | /// 43 | /// 44 | /// 45 | /// 46 | Task SetAsync(TId id, VectorTextItem item); 47 | 48 | /// 49 | /// Deletes a text by its ID 50 | /// 51 | /// 52 | /// The removed text item 53 | /// 54 | IVectorTextItem Delete(TId id); 55 | 56 | /// 57 | /// Checks if the database contains a key 58 | /// 59 | /// 60 | /// 61 | bool ContainsKey(TId id); 62 | 63 | /// 64 | /// Serializes the Vector Store to a JSON stream 65 | /// 66 | /// 67 | /// 68 | Task SerializeToJsonStreamAsync(Stream stream); 69 | 70 | /// 71 | /// Deserializes the Vector Store from a JSON stream 72 | /// 73 | /// 74 | /// 75 | Task DeserializeFromJsonStreamAsync(Stream stream); 76 | } 77 | 78 | public interface IVectorStore : IVectorStore 79 | { } 80 | -------------------------------------------------------------------------------- 
/src/.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": ".NET Core Launch (console)", 9 | "type": "coreclr", 10 | "request": "launch", 11 | "WARNING01": "*********************************************************************************", 12 | "WARNING02": "The C# extension was unable to automatically decode projects in the current", 13 | "WARNING03": "workspace to create a runnable launch.json file. A template launch.json file has", 14 | "WARNING04": "been created as a placeholder.", 15 | "WARNING05": "", 16 | "WARNING06": "If the server is currently unable to load your project, you can attempt to", 17 | "WARNING07": "resolve this by restoring any missing project dependencies (example: run 'dotnet", 18 | "WARNING08": "restore') and by fixing any reported errors from building the projects in your", 19 | "WARNING09": "workspace.", 20 | "WARNING10": "If this allows the server to now load your project then --", 21 | "WARNING11": " * Delete this file", 22 | "WARNING12": " * Open the Visual Studio Code command palette (View->Command Palette)", 23 | "WARNING13": " * run the command: '.NET: Generate Assets for Build and Debug'.", 24 | "WARNING14": "", 25 | "WARNING15": "If your project requires a more complex launch configuration, you may wish to", 26 | "WARNING16": "delete this configuration and pick a different template using the 'Add", 27 | "WARNING17": "Configuration...' 
button at the bottom of this file.", 28 | "WARNING18": "*********************************************************************************", 29 | "preLaunchTask": "build", 30 | "program": "${workspaceFolder}/ConsoleTest/bin/Debug/net8.0/ConsoleTest.dll", 31 | "args": [], 32 | "cwd": "${workspaceFolder}", 33 | "console": "internalConsole", 34 | "stopAtEntry": false 35 | }, 36 | { 37 | "name": ".NET Core Attach", 38 | "type": "coreclr", 39 | "request": "attach" 40 | }, 41 | { 42 | "name": ".NET Tests", 43 | "type": "coreclr", 44 | "request": "launch", 45 | "preLaunchTask": "build", 46 | "program": "${workspaceFolder}/SharpVectorTest/bin/Debug/net8.0/SharpVectorTest.dll", 47 | "args": [], 48 | "cwd": "${workspaceFolder}", 49 | "stopAtEntry": false, 50 | "console": "internalConsole" 51 | } 52 | ] 53 | } -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/Vectorization/BagOfWordsVectorizer.cs: -------------------------------------------------------------------------------- 1 | namespace Build5Nines.SharpVector.Vectorization; 2 | 3 | using Build5Nines.SharpVector.Vocabulary; 4 | 5 | /// 6 | /// A class that vectorizes a collection of tokens 7 | /// 8 | /// 9 | /// 10 | public class BagOfWordsVectorizer : IVectorizer 11 | where TVocabularyKey : notnull 12 | where TVocabularyValue : notnull 13 | { 14 | public async Task GenerateVectorFromTokensAsync(IVocabularyStore vocabularyStore, IEnumerable tokens) 15 | { 16 | return await Task.Run(() => GenerateVectorFromTokens(vocabularyStore, tokens)); 17 | } 18 | 19 | /// 20 | /// Generates vectors from tokens using the vocabulary. 
21 | /// 22 | /// The vocabulary store to use for vectorization 23 | /// The tokens to generate a vector from 24 | /// 25 | public float[] GenerateVectorFromTokens(IVocabularyStore vocabularyStore, IEnumerable tokens) 26 | { 27 | dynamic count = vocabularyStore.Count; 28 | var vector = new float[count]; 29 | 30 | foreach (var token in tokens) 31 | { 32 | if (vocabularyStore.TryGetValue(token, out var index)) 33 | { 34 | vector[index]++; 35 | } 36 | } 37 | 38 | return vector; 39 | } 40 | 41 | /// 42 | /// Method to normalize vectors to a specific length by padding or truncating 43 | /// 44 | /// 45 | /// 46 | /// 47 | public float[] NormalizeVector(float[] vector, TVocabularyValue length) 48 | { 49 | var intLength = Convert.ToInt32(length); 50 | float[] normalizedVector = new float[intLength]; 51 | Array.Copy(vector, normalizedVector, (long)Math.Min(vector.Length, intLength)); 52 | 53 | // Normalize the vector 54 | float magnitude = (float)Math.Sqrt(normalizedVector.Sum(v => v * v)); 55 | if (magnitude > 0) 56 | { 57 | for (int i = 0; i < normalizedVector.Length; i++) 58 | { 59 | normalizedVector[i] /= magnitude; 60 | } 61 | } 62 | // else 63 | // { 64 | // // If magnitude is zero, return the vector as it is 65 | // // or handle it as per your requirement 66 | // // For example, you can use a small value to avoid division by zero 67 | // for (int i = 0; i < normalizedVector.Length; i++) 68 | // { 69 | // //normalizedVector[i] = 0; // or 70 | // normalizedVector[i] = 1e-10f; 71 | // } 72 | // } 73 | 74 | return normalizedVector; 75 | } 76 | } -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/MemoryVectorDatabase.cs: -------------------------------------------------------------------------------- 1 | using Build5Nines.SharpVector.Vocabulary; 2 | using Build5Nines.SharpVector.Id; 3 | using Build5Nines.SharpVector.Preprocessing; 4 | using Build5Nines.SharpVector.Vectorization; 5 | using 
Build5Nines.SharpVector.VectorCompare; 6 | using Build5Nines.SharpVector.VectorStore; 7 | 8 | namespace Build5Nines.SharpVector; 9 | 10 | public interface IMemoryVectorDatabase : IVectorDatabase 11 | where TId : notnull 12 | { } 13 | 14 | /// 15 | /// A simple in-memory database for storing and querying vectorized text items. 16 | /// This database uses a Bag of Words vectorization strategy, with Cosine similarity, a dictionary vocabulary store, and a basic text preprocessor. 17 | /// 18 | /// Defines the data type for the Metadata stored with the Text. 19 | public class MemoryVectorDatabase 20 | : MemoryVectorDatabaseBase< 21 | int, 22 | TMetadata, 23 | MemoryDictionaryVectorStoreWithVocabulary, string, int>, 24 | DictionaryVocabularyStore, 25 | string, int, 26 | IntIdGenerator, 27 | BasicTextPreprocessor, 28 | BagOfWordsVectorizer, 29 | CosineSimilarityVectorComparer 30 | >, IMemoryVectorDatabase, IVectorDatabase 31 | { 32 | public MemoryVectorDatabase() 33 | : base( 34 | new MemoryDictionaryVectorStoreWithVocabulary, string, int>( 35 | new DictionaryVocabularyStore() 36 | ) 37 | ) 38 | { } 39 | 40 | 41 | [Obsolete("Use DeserializeFromBinaryStreamAsync instead.")] 42 | public override async Task DeserializeFromJsonStreamAsync(Stream stream) 43 | { 44 | await DeserializeFromBinaryStreamAsync(stream); 45 | } 46 | 47 | [Obsolete("Use DeserializeFromBinaryStream instead.")] 48 | public override void DeserializeFromJsonStream(Stream stream) 49 | { 50 | DeserializeFromBinaryStream(stream); 51 | } 52 | 53 | /// 54 | /// Deserializes the database from a binary stream. 55 | /// 56 | /// 57 | /// 58 | public override async Task DeserializeFromBinaryStreamAsync(Stream stream) 59 | { 60 | await base.DeserializeFromBinaryStreamAsync(stream); 61 | 62 | // Re-initialize the IdGenerator with the max Id value from the VectorStore 63 | _idGenerator = new IntIdGenerator(VectorStore.GetIds().Max()); 64 | } 65 | 66 | /// 67 | /// Deserializes the database from a binary stream. 
68 | /// 69 | /// 70 | public override void DeserializeFromBinaryStream(Stream stream) 71 | { 72 | base.DeserializeFromBinaryStream(stream); 73 | 74 | // Re-initialize the IdGenerator with the max Id value from the VectorStore 75 | _idGenerator = new IntIdGenerator(VectorStore.GetIds().Max()); 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/MemoryVectorDatabaseBase.cs: -------------------------------------------------------------------------------- 1 | using Build5Nines.SharpVector.Id; 2 | using Build5Nines.SharpVector.Preprocessing; 3 | using Build5Nines.SharpVector.Vocabulary; 4 | using Build5Nines.SharpVector.Vectorization; 5 | using Build5Nines.SharpVector.VectorCompare; 6 | using Build5Nines.SharpVector.VectorStore; 7 | using System.Collections.Concurrent; 8 | using System.IO.Compression; 9 | using System.Runtime.CompilerServices; 10 | using System.Text.Json; 11 | using Build5Nines.SharpVector.Embeddings; 12 | using System.Runtime.ExceptionServices; 13 | using System.Collections; 14 | using System.Linq; 15 | 16 | namespace Build5Nines.SharpVector; 17 | 18 | /// 19 | /// Base class for a memory vector database. 20 | /// 21 | /// 22 | /// 23 | /// 24 | /// 25 | /// 26 | /// 27 | /// 28 | /// 29 | /// 30 | /// 31 | public abstract class MemoryVectorDatabaseBase 32 | : VectorDatabaseBase 33 | where TId : notnull 34 | where TVocabularyKey : notnull 35 | where TVocabularyValue: notnull 36 | where TVectorStore : IVectorStoreWithVocabulary 37 | where TVocabularyStore : IVocabularyStore 38 | where TIdGenerator : IIdGenerator, new() 39 | where TTextPreprocessor : ITextPreprocessor, new() 40 | where TVectorizer : IVectorizer, new() 41 | where TVectorComparer : IVectorComparer, new() 42 | { 43 | protected MemoryVectorDatabaseBase(TVectorStore vectorStore) 44 | : base(vectorStore) 45 | { } 46 | } 47 | 48 | /// 49 | /// Base class for a memory vector database. 
50 | /// 51 | /// 52 | /// 53 | /// 54 | /// 55 | /// 56 | public abstract class MemoryVectorDatabaseBase 57 | : VectorDatabaseBase, IMemoryVectorDatabase 58 | where TId : notnull 59 | where TVectorStore : IVectorStore 60 | where TIdGenerator : IIdGenerator, new() 61 | where TVectorComparer : IVectorComparer, new() 62 | { 63 | public MemoryVectorDatabaseBase(IEmbeddingsGenerator embeddingsGenerator, TVectorStore vectorStore) 64 | : base(embeddingsGenerator, vectorStore) 65 | { } 66 | } 67 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Build5Nines SharpVector - The lightweight, in-memory, local, Semantic Search, Text Vector Database for any C# / .NET Applications 2 | 3 | `Build5Nines.SharpVector` is an in-memory vector database library designed for .NET applications. It allows you to store, search, and manage text data using vector representations. The library is customizable and extensible, enabling support for different vector comparison methods, preprocessing techniques, and vectorization strategies. 
4 | 5 | [![Release Build](https://github.com/Build5Nines/SharpVector/actions/workflows/build-release.yml/badge.svg)](https://github.com/Build5Nines/SharpVector/actions/workflows/build-release.yml) 6 | ![Libraries.io dependency status for GitHub repo](https://img.shields.io/librariesio/github/build5nines/sharpvector) 7 | 8 | [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE) 9 | ![Framework: .NET 8+](https://img.shields.io/badge/framework-.NET%208%2B-blue) 10 | ![Semantic Search: Enabled](https://img.shields.io/badge/semantic%20search-enabled-purple) 11 | ![Gen AI: Ready](https://img.shields.io/badge/Gen%20AI-ready-purple) 12 | 13 | Vector databases are used with Semantic Search and [Generative AI](https://build5nines.com/what-is-generative-ai/?utm_source=github&utm_medium=sharpvector) solutions augmenting the LLM (Large Language Model) with the ability to load additional context data with the AI prompt using the [RAG (Retrieval-Augmented Generation)](https://build5nines.com/what-is-retrieval-augmented-generation-rag/?utm_source=github&utm_medium=sharpvector) design pattern. 14 | 15 | While there are lots of large databases that can be used to build Vector Databases (like Azure CosmosDB, PostgreSQL w/ pgvector, Azure AI Search, Elasticsearch, and more), there are not many options for a lightweight vector database that can be embedded into any .NET application to provide a local text vector database. 16 | 17 | > "For the in-memory vector database, we're using Build5Nines.SharpVector, an excellent open-source project by Chris Pietschmann. SharpVector makes it easy to store and retrieve vectorized data, making it an ideal choice for our sample RAG implementation." 
18 | > 19 | > [Tulika Chaudharie, Principal Product Manager at Microsoft for Azure App Service](https://azure.github.io/AppService/2024/09/03/Phi3-vector.html) 20 | 21 | Build5Nines SharpVector is the lightweight, local, in-memory Text Vector Database for implementing semantic search into any .NET application! 22 | 23 | ### [Documentation](https://sharpvector.build5nines.com) | [Get Started](https://sharpvector.build5nines.com/get-started/) | [Samples](https://sharpvector.build5nines.com/samples/) 24 | 25 | ## Nuget Package 26 | 27 | The `Build5Nines.SharpVector` library is available as a Nuget Package to easily include into your .NET projects: 28 | 29 | ```bash 30 | dotnet add package Build5Nines.SharpVector 31 | ``` 32 | 33 | You can view it on Nuget.org here: [Build5Nines.SharpVector](https://www.nuget.org/packages/Build5Nines.SharpVector) 34 | 35 | ## Maintained By 36 | 37 | The **Build5Nines SharpVector** project is maintained by [Chris Pietschmann](https://pietschsoft.com?utm_source=github&utm_medium=sharpvector), founder of [Build5Nines](https://build5nines.com?utm_source=github&utm_medium=sharpvector), Microsoft MVP, HashiCorp Ambassador, and Microsoft Certified Trainer (MCT). 38 | -------------------------------------------------------------------------------- /docs/docs/get-started/data-management/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Data Management 3 | 4 | --- 5 | # :material-database-edit-outline: Data Management 6 | 7 | Since `Build5Nines.SharpVector` is a database, it also has data management methods available. These methods enable you to add, remove, and update the text documents that are vectorized and indexed within the semantic database. 8 | 9 | ## Get Text Item ID 10 | 11 | Every text item within a `Build5Nines.SharpVector` database is assigned a unique identifier (ID). There are a few ways to get access to the ID of the text items. 
12 | 13 | === ".AddText()" 14 | 15 | When adding an individual text item to the vector database, the ID value will be returned: 16 | 17 | ```csharp 18 | var id = vdb.AddText(txt, metadata); 19 | 20 | var id = await vdb.AddTextAsync(txt, metadata); 21 | ``` 22 | 23 | === ".Search()" 24 | 25 | When you perform a semantic search, the search results will contain the list of texts, each of which has an `Id` property. 26 | 27 | ```csharp 28 | var results = vdb.Search("query text"); 29 | 30 | foreach(var item in results.Texts) { 31 | var id = item.Id; 32 | var text = item.Text; 33 | var metadata = item.Metadata; 34 | // do something here 35 | } 36 | ``` 37 | 38 | === "Enumerator" 39 | 40 | The `IVectorDatabase` classes implement `IEnumerable` so you can easily loop through all the text items that have been added to the database. 41 | 42 | ```csharp 43 | foreach(var item in vdb) { 44 | var id = item.Id; 45 | var text = item.Text; 46 | var metadata = item.Metadata; 47 | var vector = item.Vector; 48 | 49 | // do something here 50 | } 51 | ``` 52 | 53 | ## Get 54 | 55 | If you know the `id` of a Text item in the database, you can retrieve it directly. 56 | 57 | ### Get By Id 58 | 59 | The `.GetText` method can be used to retrieve a text item from the vector database directly. 60 | 61 | ```csharp 62 | vdb.GetText(id); 63 | ``` 64 | 65 | ## Update 66 | 67 | Once text items have been added to the database, "Update" methods can be used to modify them. 68 | 69 | ### Update Text 70 | 71 | The `.UpdateText` method can be used to update the `Text` value, and associated vectors will be updated. 72 | 73 | ```csharp 74 | vdb.UpdateText(id, newTxt); 75 | ``` 76 | 77 | When the `Text` is updated, new vector embeddings are generated for the new text. 78 | 79 | ### Update Metadata 80 | 81 | The `.UpdateTextMetadata` method can be used to update the `Metadata` for a given text item by `Id`. 
82 | 83 | ```csharp 84 | vdb.UpdateTextMetadata(id, newMetadata); 85 | ``` 86 | 87 | When `Metadata` is updated, the vector embeddings are not updated. 88 | 89 | ### Update Text and Metadata 90 | 91 | The `.UpdateTextAndMetadata` method can be used to update the `Text` and `Metadata` for a text item in the database for the given text item `Id`. 92 | 93 | ```csharp 94 | vdb.UpdateTextAndMetadata(id, newTxt, newMetadata); 95 | ``` 96 | 97 | ## Delete 98 | 99 | The vector database supports the ability to delete text items. 100 | 101 | ### Delete Text 102 | 103 | The `.DeleteText` method can be used to delete a text item from the database for the given `Id`. 104 | 105 | ```csharp 106 | vdb.DeleteText(id); 107 | ``` 108 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/VectorCompare/CosineSimilarityVectorComparerAsync.cs: -------------------------------------------------------------------------------- 1 | using System.Reflection; 2 | using System.Runtime.InteropServices; 3 | 4 | namespace Build5Nines.SharpVector.VectorCompare; 5 | 6 | public class CosineSimilarityVectorComparer : IVectorComparer 7 | { 8 | /// 9 | /// Calculates the cosine similarity between two vectors. 10 | /// 11 | /// Cosine Similarity is a metric used to measure how similar two vectors are. It calculates the cosine of the angle between two vectors projected in a multi-dimensional space. The result of the cosine similarity ranges from -1 to 1. 12 | /// 13 | /// 14 | /// 15 | /// 16 | /// 17 | public async Task CalculateAsync(float[] vectorA, float[] vectorB) 18 | { 19 | return await Task.Run(() => Calculate(vectorA, vectorB)); 20 | } 21 | 22 | /// 23 | /// Calculates the cosine similarity between two vectors. 24 | /// 25 | /// Cosine Similarity is a metric used to measure how similar two vectors are. It calculates the cosine of the angle between two vectors projected in a multi-dimensional space. 
The result of the cosine similarity ranges from -1 to 1. 26 | /// 27 | /// 28 | /// 29 | /// 30 | /// 31 | public float Calculate(float[] vectorA, float[] vectorB) 32 | { 33 | if (vectorA.Length != vectorB.Length) 34 | { 35 | throw new ArgumentException("Vectors must be of the same length."); 36 | } 37 | 38 | float dotProduct = 0; 39 | float magnitudeA = 0; 40 | float magnitudeB = 0; 41 | 42 | for (int i = 0; i < vectorA.Length; i++) 43 | { 44 | dotProduct += vectorA[i] * vectorB[i]; 45 | magnitudeA += vectorA[i] * vectorA[i]; 46 | magnitudeB += vectorB[i] * vectorB[i]; 47 | } 48 | 49 | magnitudeA = (float)Math.Sqrt(magnitudeA); 50 | magnitudeB = (float)Math.Sqrt(magnitudeB); 51 | 52 | if (magnitudeA == 0 || magnitudeB == 0) 53 | { 54 | return 0; 55 | } 56 | 57 | return dotProduct / (magnitudeA * magnitudeB); 58 | } 59 | 60 | public IEnumerable> Sort(IEnumerable> results) 61 | { 62 | return results.OrderByDescending(s => s.Similarity); 63 | } 64 | 65 | public async Task>> SortAsync(IEnumerable> results) 66 | { 67 | return await Task.Run(() => Sort(results)); 68 | } 69 | 70 | public bool IsWithinThreshold(float? threshold, float vectorComparisonValue) 71 | { 72 | if (threshold == null) 73 | { 74 | return true; 75 | } 76 | var thresholdToCompare = threshold ?? (float)0.0f; 77 | var thresholdIsEqual = Math.Abs(vectorComparisonValue - thresholdToCompare) < 1e-6f; // epsilon; 78 | return thresholdIsEqual || vectorComparisonValue > thresholdToCompare; 79 | } 80 | } -------------------------------------------------------------------------------- /docs/docs/get-started/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Get Started 3 | description: Get up and running with SharpVector in minutes. Learn how to install, initialize, and begin storing and searching vectorized text data. 4 | --- 5 | # :octicons-rocket-24: Get Started 6 | 7 | It's really easy to get started with using `Build5Nines.SharpVector`. 
Simply follow the below steps. 8 | 9 | ## Prerequisites 10 | 11 | Using `Build5Nines.SharpVector` requires the following: 12 | 13 | - .NET 8.0 or later 14 | 15 | ## Install Nuget Package 16 | 17 | The `Build5Nines.SharpVector` library is available as a [Nuget package](https://www.nuget.org/packages/Build5Nines.SharpVector): 18 | 19 | === ".NET CLI" 20 | ```bash 21 | dotnet add package Build5Nines.SharpVector 22 | ``` 23 | 24 | === "Package Manager" 25 | ```powershell 26 | Nuget\Install-Package Build5Nines.SharpVector 27 | ``` 28 | 29 | ## Basic example 30 | 31 | The following is a basic example of using `Build5Nines.SharpVector` to create and use an in-memory vector database within a C# application: 32 | 33 | ```csharp 34 | using Build5Nines.SharpVector; 35 | 36 | // Create a Vector Database with metadata of type string 37 | var vdb = new BasicMemoryVectorDatabase(); 38 | // The Metadata is declared using generics, so you can store whatever data you need there. 39 | 40 | // Load Vector Database with some sample text data 41 | // Text is the movie description, and Metadata is the movie title with release year in this example 42 | vdb.AddText("Iron Man (2008) is a Marvel Studios action, adventure, and sci-fi movie about Tony Stark (Robert Downey Jr.), a billionaire inventor and weapons developer who is kidnapped by terrorists and forced to build a weapon. Instead, Tony uses his ingenuity to build a high-tech suit of armor and escape, becoming the superhero Iron Man. 
He then returns to the United States to refine the suit and use it to fight crime and terrorism.", "Iron Man (2008)"); 43 | vdb.AddText("The Lion King is a 1994 Disney animated film about a young lion cub named Simba who is the heir to the throne of an African savanna.", "The Lion King (1994)"); 44 | vdb.AddText("Aladdin is a 2019 live-action Disney adaptation of the 1992 animated classic of the same name about a street urchin who finds a magic lamp and uses a genie's wishes to become a prince so he can marry Princess Jasmine.", "Aladdin (2019)"); 45 | vdb.AddText("The Little Mermaid is a 2023 live-action adaptation of Disney's 1989 animated film of the same name. The movie is about Ariel, the youngest of King Triton's daughters, who is fascinated by the human world and falls in love with Prince Eric.", "The Little Mermaid"); 46 | vdb.AddText("Frozen is a 2013 Disney movie about a fearless optimist named Anna who sets off on a journey to find her sister Elsa, whose icy powers have trapped their kingdom in eternal winter.", "Frozen (2013)"); 47 | 48 | // Perform a Vector Search 49 | var result = vdb.Search(newPrompt, pageCount: 5); // return the first 5 results 50 | 51 | if (!result.IsEmpty) 52 | { 53 | Console.WriteLine("Similar Text Found:"); 54 | foreach (var item in result.Texts) 55 | { 56 | Console.WriteLine(item.Metadata); 57 | Console.WriteLine(item.Text); 58 | } 59 | } else { 60 | Console.WriteLine("No results found."); 61 | } 62 | ``` 63 | 64 | 🚀 You are now using an in-memory vector database to implement semantic text searching within your app! 65 | 66 | !!! info 67 | The `Build5Nines.SharpVector` library, by default, supports local text vector generation. However, there is also additional support for both [OpenAI and Ollama embeddings models](../embeddings) for using higher quality, more robust vector generation. 
68 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/VectorTextResult.cs: -------------------------------------------------------------------------------- 1 | 2 | namespace Build5Nines.SharpVector; 3 | 4 | using System.Collections.Generic; 5 | using System.Linq; 6 | using System.Runtime.InteropServices; 7 | 8 | /// 9 | /// Represents a result of a vector text search. 10 | /// 11 | /// The type of the identifier. 12 | /// The type of the document. 13 | /// The type of the metadata. 14 | public interface IVectorTextResult 15 | { 16 | /// 17 | /// The list of Texts found in the search results. 18 | /// 19 | IEnumerable> Texts { get; } 20 | 21 | /// 22 | /// Returns true if the search returned no results. 23 | /// 24 | bool IsEmpty { get; } 25 | 26 | /// 27 | /// The total count of Texts found in the search results. 28 | /// 29 | int TotalCount { get; } 30 | 31 | /// 32 | /// The current page index of the search results. 33 | /// 34 | public int PageIndex { get; } 35 | 36 | /// 37 | /// The total number of pages of search results. 38 | /// 39 | public int TotalPages { get; } 40 | } 41 | 42 | /// 43 | /// Represents a result of a vector text search. 44 | /// 45 | /// The type of the metadata. 46 | public interface IVectorTextResult 47 | : IVectorTextResult 48 | { } 49 | 50 | /// 51 | /// Represents a result of a vector text search. 52 | /// 53 | /// The type of the identifier. 54 | /// The type of the document. 55 | /// The type of the metadata. 56 | public class VectorTextResult 57 | : IVectorTextResult 58 | { 59 | public VectorTextResult(int totalCount, int pageIndex, int totalPages, IEnumerable> texts) 60 | { 61 | Texts = texts; 62 | TotalCount = totalCount; 63 | PageIndex = pageIndex; 64 | TotalPages = totalPages; 65 | } 66 | 67 | /// 68 | /// Returns true if the search returned no results. 
69 | /// 70 | public IEnumerable> Texts { get; private set; } 71 | 72 | /// 73 | /// Returns true if the search returned no results. 74 | /// 75 | public bool IsEmpty { get => Texts == null || !Texts.Any(); } 76 | 77 | /// 78 | /// The total count of Texts found in the search results. 79 | /// 80 | public int TotalCount { get; private set; } 81 | 82 | /// 83 | /// The current page index of the search results. 84 | /// 85 | public int PageIndex { get; private set; } 86 | 87 | /// 88 | /// The total number of pages of search results. 89 | /// 90 | public int TotalPages { get; private set; } 91 | } 92 | 93 | /// 94 | /// Represents a result of a vector text search. 95 | /// 96 | /// The type of the metadata. 97 | public class VectorTextResult 98 | : VectorTextResult, IVectorTextResult 99 | { 100 | public VectorTextResult(int totalCount, int pageIndex, int totalPages, IEnumerable> texts) 101 | : base(totalCount, pageIndex, totalPages, texts) 102 | { } 103 | } -------------------------------------------------------------------------------- /src/SharpVectorTest/VectorStore/MemoryDictionaryVectorStoreTest.cs: -------------------------------------------------------------------------------- 1 | using Build5Nines.SharpVector; 2 | using Build5Nines.SharpVector.VectorStore; 3 | 4 | namespace SharpVectorTest.VectorStore; 5 | 6 | [TestClass] 7 | public class MemoryDictionaryVectorStoreTests 8 | { 9 | [TestMethod] 10 | public async Task SerializeDeserializeStream_001() 11 | { 12 | var vectorStore = new MemoryDictionaryVectorStore(); 13 | vectorStore.Set(1, new VectorTextItem("key1", "1", new float[] { 1.0F, 2.0F, 3.0F })); 14 | vectorStore.Set(2, new VectorTextItem("key2", "2", new float[] { 4.0F, 5.0F, 6.0F })); 15 | vectorStore.Set(3, new VectorTextItem("key3", "3", new float[] { 7.0F, 8.0F, 9.0F })); 16 | vectorStore.Set(4, new VectorTextItem("key4", "4", new float[] { 10.0F, 11.0F, 12.0F })); 17 | 18 | 19 | var stream = new MemoryStream(); 20 | await 
vectorStore.SerializeToJsonStreamAsync(stream); 21 | 22 | stream.Position = 0; // move to beginning of stream 23 | 24 | var vectorStoreTwo = new MemoryDictionaryVectorStore(); 25 | await vectorStoreTwo.DeserializeFromJsonStreamAsync(stream); 26 | 27 | Assert.AreEqual(4, vectorStoreTwo.Count()); 28 | 29 | Assert.AreEqual(3, vectorStoreTwo.Get(1).Vector.Length); 30 | Assert.AreEqual(3, vectorStoreTwo.Get(2).Vector.Length); 31 | Assert.AreEqual(3, vectorStoreTwo.Get(3).Vector.Length); 32 | Assert.AreEqual(3, vectorStoreTwo.Get(4).Vector.Length); 33 | 34 | Assert.AreEqual(1.0, vectorStoreTwo.Get(1).Vector[0]); 35 | Assert.AreEqual(2.0, vectorStoreTwo.Get(1).Vector[1]); 36 | Assert.AreEqual(3.0, vectorStoreTwo.Get(1).Vector[2]); 37 | 38 | Assert.AreEqual(4.0, vectorStoreTwo.Get(2).Vector[0]); 39 | Assert.AreEqual(5.0, vectorStoreTwo.Get(2).Vector[1]); 40 | Assert.AreEqual(6.0, vectorStoreTwo.Get(2).Vector[2]); 41 | 42 | Assert.AreEqual(7.0, vectorStoreTwo.Get(3).Vector[0]); 43 | Assert.AreEqual(8.0, vectorStoreTwo.Get(3).Vector[1]); 44 | Assert.AreEqual(9.0, vectorStoreTwo.Get(3).Vector[2]); 45 | 46 | Assert.AreEqual(10.0, vectorStoreTwo.Get(4).Vector[0]); 47 | Assert.AreEqual(11.0, vectorStoreTwo.Get(4).Vector[1]); 48 | Assert.AreEqual(12.0, vectorStoreTwo.Get(4).Vector[2]); 49 | } 50 | 51 | [TestMethod] 52 | public void MemoryVectorStore_001() 53 | { 54 | var vectorStore = new MemoryDictionaryVectorStore(); 55 | vectorStore.Set(1, new VectorTextItem("key1", "1", new float[] { 1.0F, 2.0F, 3.0F })); 56 | vectorStore.Set(2, new VectorTextItem("key2", "2", new float[] { 4.0F, 5.0F, 6.0F })); 57 | vectorStore.Set(3, new VectorTextItem("key3", "3", new float[] { 7.0F, 8.0F, 9.0F })); 58 | vectorStore.Set(4, new VectorTextItem("key4", "4", new float[] { 10.0F, 11.0F, 12.0F })); 59 | 60 | var item = vectorStore.Get(2); 61 | Assert.AreEqual("key2", item.Text); 62 | } 63 | 64 | [TestMethod] 65 | public void MemoryVectorStore_002() 66 | { 67 | var vectorStore = new 
MemoryDictionaryVectorStore(); 68 | vectorStore.Set(1, new VectorTextItem("key1", "1", new float[] { 1.0F, 2.0F, 3.0F })); 69 | vectorStore.Set(2, new VectorTextItem("key2", "2", new float[] { 4.0F, 5.0F, 6.0F })); 70 | vectorStore.Set(3, new VectorTextItem("key3", "3", new float[] { 7.0F, 8.0F, 9.0F })); 71 | vectorStore.Set(4, new VectorTextItem("key4", "4", new float[] { 10.0F, 11.0F, 12.0F })); 72 | 73 | foreach(var item in vectorStore) 74 | { 75 | Assert.IsNotNull(item.Value); 76 | Assert.AreNotEqual(0, item.Key); 77 | } 78 | } 79 | 80 | } -------------------------------------------------------------------------------- /docs/docs/get-started/search/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Semantic Search 3 | --- 4 | # :material-file-search: Semantic Search 5 | 6 | Once text items and their associated metadata have been added to the vector database, the database can be used for semantic search to find matching text items for a given query. 7 | 8 | The `BasicMemoryVectorDatabase` and `MemoryVectorDatabase<>` classes both contain `.Search` and `.SearchAsync` methods that can be used to perform semantic search on the database: 9 | 10 | === "Sync" 11 | 12 | ```csharp 13 | var query = "some text to search"; 14 | var results = vdb.Search(query); 15 | ``` 16 | 17 | === "Async" 18 | 19 | ```csharp 20 | var query = "some text to search"; 21 | var results = await vdb.SearchAsync(query); 22 | ``` 23 | 24 | ## Metadata Filters 25 | 26 | The `.Search` and `.SearchAsync` methods also include the ability to pre-filter the search results based on a boolean evaluation of the `Metadata` for the text item. This check is run before the vector similarity search is performed, and can help increase search performance on large datasets. 
27 | 28 | Here are a couple examples of using the `filter` parameter to perform `Metadata` filtering when performing semantic searches: 29 | 30 | === "Sync" 31 | 32 | ```csharp 33 | var vdb = new BasicMemoryVectorDatabase(); 34 | 35 | // load text and metadata into database 36 | 37 | var query = "some text to search"; 38 | var results = vdb.Search( 39 | query, 40 | filter: (metadata) => { 41 | // perform some operation to check metadata 42 | // return true or false 43 | return metadata.Contains("B59"); 44 | } 45 | ); 46 | ``` 47 | 48 | === "Async" 49 | 50 | ```csharp 51 | var vdb = new MemoryVectorDatabase(); 52 | 53 | // load text and metadata into database 54 | 55 | var query = "some text to search"; 56 | var results = await vdb.SearchAsync( 57 | query, 58 | filter: async (metadata) => { 59 | // perform some operation to check metadata 60 | // return true or false 61 | return metadata.LastName == "Pietschmann"; 62 | } 63 | ); 64 | ``` 65 | 66 | !!! info "OpenAI and Ollama Support" 67 | 68 | This functionality works the same with both [:simple-openai: OpenAI and :simple-ollama: Ollama supported vector databases](../../embeddings/index.md) too. 69 | 70 | ## Paging 71 | 72 | The `.Search` and `.SearchAsync` methods also include the ability to perform paging on the text items returned from the semantic search. This is performed after the similarity search and the `filter` has been applied to the search results. This is done using the optional `pageCount` and `pageIndex` parameters.
73 | 74 | Here are a couple examples of using the `pageCount` and `pageIndex` parameters to perform paging with the semantic search results: 75 | 76 | === "Sync" 77 | 78 | ```csharp 79 | var vdb = new BasicMemoryVectorDatabase(); 80 | 81 | // load text and metadata into database 82 | 83 | var query = "some text to search"; 84 | var results = vdb.Search( 85 | query, 86 | pageIndex: 0, // return first page of results (default: 0) 87 | pageCount: 6 // limit length of this page of results (default: unlimited) 88 | ); 89 | ``` 90 | 91 | === "Async" 92 | 93 | ```csharp 94 | var vdb = new MemoryVectorDatabase(); 95 | 96 | // load text and metadata into database 97 | 98 | var query = "some text to search"; 99 | var results = await vdb.SearchAsync( 100 | query, 101 | pageIndex: 0, // return first page of results (default: 0) 102 | pageCount: 6 // limit length of this page of results (default: unlimited) 103 | ); 104 | ``` 105 | 106 | The `pageCount` and `pageIndex` parameters are optional, and can be used individually or together.
107 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector.Playground/Components/Layout/NavMenu.razor.css: -------------------------------------------------------------------------------- 1 | .navbar-toggler { 2 | appearance: none; 3 | cursor: pointer; 4 | width: 3.5rem; 5 | height: 2.5rem; 6 | color: white; 7 | position: absolute; 8 | top: 0.5rem; 9 | right: 1rem; 10 | border: 1px solid rgba(255, 255, 255, 0.1); 11 | background: url("data:image/svg+xml,%3csvg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 30 30'%3e%3cpath stroke='rgba%28255, 255, 255, 0.55%29' stroke-linecap='round' stroke-miterlimit='10' stroke-width='2' d='M4 7h22M4 15h22M4 23h22'/%3e%3c/svg%3e") no-repeat center/1.75rem rgba(255, 255, 255, 0.1); 12 | } 13 | 14 | .navbar-toggler:checked { 15 | background-color: rgba(255, 255, 255, 0.5); 16 | } 17 | 18 | .top-row { 19 | height: 3.5rem; 20 | background-color: rgba(0,0,0,0.4); 21 | } 22 | 23 | .navbar-brand { 24 | font-size: 1.1rem; 25 | } 26 | 27 | .bi { 28 | display: inline-block; 29 | position: relative; 30 | width: 1.25rem; 31 | height: 1.25rem; 32 | margin-right: 0.75rem; 33 | top: -1px; 34 | background-size: cover; 35 | } 36 | 37 | .bi-house-door-fill-nav-menu { 38 | background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='16' height='16' fill='white' class='bi bi-house-door-fill' viewBox='0 0 16 16'%3E%3Cpath d='M6.5 14.5v-3.505c0-.245.25-.495.5-.495h2c.25 0 .5.25.5.5v3.5a.5.5 0 0 0 .5.5h4a.5.5 0 0 0 .5-.5v-7a.5.5 0 0 0-.146-.354L13 5.793V2.5a.5.5 0 0 0-.5-.5h-1a.5.5 0 0 0-.5.5v1.293L8.354 1.146a.5.5 0 0 0-.708 0l-6 6A.5.5 0 0 0 1.5 7.5v7a.5.5 0 0 0 .5.5h4a.5.5 0 0 0 .5-.5Z'/%3E%3C/svg%3E"); 39 | } 40 | 41 | .bi-plus-square-fill-nav-menu { 42 | background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='16' height='16' fill='white' class='bi bi-plus-square-fill' viewBox='0 0 16 16'%3E%3Cpath d='M2 0a2 2 0 0 0-2 2v12a2 2 
0 0 0 2 2h12a2 2 0 0 0 2-2V2a2 2 0 0 0-2-2H2zm6.5 4.5v3h3a.5.5 0 0 1 0 1h-3v3a.5.5 0 0 1-1 0v-3h-3a.5.5 0 0 1 0-1h3v-3a.5.5 0 0 1 1 0z'/%3E%3C/svg%3E"); 43 | } 44 | 45 | .bi-list-nested-nav-menu { 46 | background-image: url("data:image/svg+xml,%3Csvg xmlns='http://www.w3.org/2000/svg' width='16' height='16' fill='white' class='bi bi-list-nested' viewBox='0 0 16 16'%3E%3Cpath fill-rule='evenodd' d='M4.5 11.5A.5.5 0 0 1 5 11h10a.5.5 0 0 1 0 1H5a.5.5 0 0 1-.5-.5zm-2-4A.5.5 0 0 1 3 7h10a.5.5 0 0 1 0 1H3a.5.5 0 0 1-.5-.5zm-2-4A.5.5 0 0 1 1 3h10a.5.5 0 0 1 0 1H1a.5.5 0 0 1-.5-.5z'/%3E%3C/svg%3E"); 47 | } 48 | 49 | .nav-item { 50 | font-size: 0.9rem; 51 | padding-bottom: 0.5rem; 52 | } 53 | 54 | .nav-item:first-of-type { 55 | padding-top: 1rem; 56 | } 57 | 58 | .nav-item:last-of-type { 59 | padding-bottom: 1rem; 60 | } 61 | 62 | .nav-item ::deep .nav-link { 63 | color: #d7d7d7; 64 | background: none; 65 | border: none; 66 | border-radius: 4px; 67 | height: 3rem; 68 | display: flex; 69 | align-items: center; 70 | line-height: 3rem; 71 | width: 100%; 72 | } 73 | 74 | .nav-item ::deep a.active { 75 | background-color: rgba(255,255,255,0.37); 76 | color: white; 77 | } 78 | 79 | .nav-item ::deep .nav-link:hover { 80 | background-color: rgba(255,255,255,0.1); 81 | color: white; 82 | } 83 | 84 | .nav-scrollable { 85 | display: none; 86 | } 87 | 88 | .navbar-toggler:checked ~ .nav-scrollable { 89 | display: block; 90 | } 91 | 92 | @media (min-width: 641px) { 93 | .navbar-toggler { 94 | display: none; 95 | } 96 | 97 | .nav-scrollable { 98 | /* Never collapse the sidebar for wide screens */ 99 | display: block; 100 | 101 | /* Allow sidebar to scroll for tall menus */ 102 | height: calc(100vh - 3.5rem); 103 | overflow-y: auto; 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/docs/README.md: -------------------------------------------------------------------------------- 1 | Build5Nines.SharpVector is 
the lightweight in-memory Vector Database for use in any .NET application. 2 | 3 | The `Build5Nines.SharpVector.BasicMemoryVectorDatabase` class uses a Bag of Words vectorization strategy, with Cosine similarity, a dictionary vocabulary store, and a basic text preprocessor. 4 | 5 | ### Example Usage: Load and Search Vector Database 6 | 7 | ```csharp 8 | // Create a Vector Database with metadata of type string 9 | var vdb = new BasicMemoryVectorDatabase(); 10 | // The Metadata is declared using generics, so you can store whatever data you need there. 11 | 12 | // Load Vector Database with some sample text data 13 | // Text is the movie description, and Metadata is the movie title with release year in this example 14 | vdb.AddText("Iron Man (2008) is a Marvel Studios action, adventure, and sci-fi movie about Tony Stark (Robert Downey Jr.), a billionaire inventor and weapons developer who is kidnapped by terrorists and forced to build a weapon. Instead, Tony uses his ingenuity to build a high-tech suit of armor and escape, becoming the superhero Iron Man. He then returns to the United States to refine the suit and use it to fight crime and terrorism.", "Iron Man (2008)"); 15 | vdb.AddText("The Lion King is a 1994 Disney animated film about a young lion cub named Simba who is the heir to the throne of an African savanna.", "The Lion King (1994)"); 16 | vdb.AddText("Aladdin is a 2019 live-action Disney adaptation of the 1992 animated classic of the same name about a street urchin who finds a magic lamp and uses a genie's wishes to become a prince so he can marry Princess Jasmine.", "Alladin (2019)"); 17 | vdb.AddText("The Little Mermaid is a 2023 live-action adaptation of Disney's 1989 animated film of the same name. 
The movie is about Ariel, the youngest of King Triton's daughters, who is fascinated by the human world and falls in love with Prince Eric.", "The Little Mermaid"); 18 | vdb.AddText("Frozen is a 2013 Disney movie about a fearless optimist named Anna who sets off on a journey to find her sister Elsa, whose icy powers have trapped their kingdom in eternal winter.", "Frozen (2013)"); 19 | 20 | // Perform a Vector Search 21 | var result = vdb.Search(newPrompt, pageCount: 5); // return the first 5 results 22 | 23 | if (result.HasResults) 24 | { 25 | Console.WriteLine("Similar Text Found:"); 26 | foreach (var item in result.Texts) 27 | { 28 | Console.WriteLine(item.Metadata); 29 | Console.WriteLine(item.Text); 30 | } 31 | } 32 | ``` 33 | 34 | ### Example Usage: Loading with Different Text Chunking Methods 35 | 36 | Also, the `TextDataLoader` can be used to help load text documents into the Vector Database with support for multiple different text chunking methods: 37 | 38 | ```csharp 39 | /// Paragraph Chunking 40 | var loader = new TextDataLoader(vdb); 41 | loader.AddDocument(document, new TextChunkingOptions 42 | { 43 | Method = TextChunkingMethod.Paragraph, 44 | RetrieveMetadata = (chunk) => { 45 | // add some basic metadata since this can't be null 46 | return "{ chuckSize: \"" + chunk.Length + "\" }"; 47 | } 48 | }); 49 | ``` 50 | 51 | ## Tutorials 52 | 53 | Here's a couple helpful tutorial links with additional documentation and examples on using `Build5Nines.SharpVector` in your own projects: 54 | 55 | - [Perform Vector Database Similarity Search in .NET Apps using Build5Nines.SharpVector](https://build5nines.com/using-build5nines-sharpvector-for-vector-similarity-search-in-net-applications/) by Chris Pietschmann 56 | - [Build a Generative AI + RAG App in C# with Phi-3, ONNX, and SharpVector](https://build5nines.com/build-a-generative-ai-rag-app-in-c-with-phi-3-onnx-and-sharpvector/) by Chris Pietschmann 57 | - [Implementing Local RAG using Phi-3 ONNX Runtime and 
Sidecar Pattern on Linux App Service](https://azure.github.io/AppService/2024/09/03/Phi3-vector.html) by Tulika Chaudharie (Principal Product Manager at Microsoft for Azure App Service) 58 | -------------------------------------------------------------------------------- /docs/docs/embeddings/ollama/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Ollama Embeddings 3 | description: Integrate Ollama embedding models with SharpVector to supercharge your semantic search and AI features in .NET apps. 4 | --- 5 | 6 | # :simple-ollama: Ollama Embeddings 7 | 8 | Integrating [Ollama](https://ollama.com) embedding models with `Build5Nines.SharpVector` enhances the semantic search capabilities of your .NET applications. By leveraging models like `nomic-embed-text` or others, you can generate higher quality vector representations of text, leading to more accurate and contextually relevant search results. 9 | 10 | ## Why Use an Ollama Embedding Model? 11 | 12 | While **SharpVector** includes basic embedding generation, utilizing an Ollama embedding model offers significant advantages: 13 | 14 | - **Improved Search Accuracy**: Embedding models capture the semantic meaning of text more accurately, resulting in more relevant search outcomes. 15 | - **Pre-trained on Extensive Data**: These models are trained on vast datasets, enhancing their robustness and generalization capabilities. 16 | - **Optimized for Performance**: Designed for efficient retrieval and indexing, Ollama embedding models facilitate faster search operations.
17 | 18 | ## Getting Started 19 | 20 | To integrate an Ollama embedding model with SharpVector, install the `Build5Nines.SharpVector.Ollama` NuGet package: 21 | 22 | === ".NET CLI" 23 | ```bash 24 | dotnet add package Build5Nines.SharpVector.Ollama 25 | ``` 26 | === "Package Manager" 27 | ```powershell 28 | Nuget\Install-Package Build5Nines.SharpVector.Ollama 29 | ``` 30 | 31 | This package includes the core `Build5Nines.SharpVector` library and dependencies required to connect to Ollama's embedding API. 32 | 33 | ## Initialize the Vector Database using Ollama 34 | 35 | With the Ollama embedding model running, initialize the **SharpVector** database: 36 | 37 | ```csharp 38 | using Build5Nines.SharpVector.Ollama; 39 | 40 | var modelName = "nomic-embed-text"; 41 | 42 | // For connecting to Locally running ('localhost') Ollama 43 | var vectorDatabase = new BasicOllamaMemoryVectorDatabase(modelName); 44 | 45 | // For connecting to a different Ollama endpoint URL 46 | var ollamaEndpoint = "http://localhost:11434/api/embeddings"; 47 | var vectorDatabase = new BasicOllamaMemoryVectorDatabase(ollamaEndpoint, modelName); 48 | ``` 49 | 50 | ## Adding Text Data 51 | 52 | To add text documents to the vector database: 53 | 54 | ```csharp 55 | // sync 56 | vectorDatabase.AddText(documentText, metadataText); 57 | 58 | // async 59 | await vectorDatabase.AddTextAsync(documentText, metadataText); 60 | ``` 61 | 62 | - `documentText`: The textual content to be vectorized. 63 | - `metadataText`: Associated metadata (e.g., document title, JSON string) stored alongside the vectorized text. 64 | 65 | !!! note 66 | Metadata is not vectorized but is retrieved with search results, providing context. 67 | 68 | ## Performing Similarity Search 69 | 70 | The `SearchAsync` method returns documents whose vector representations closely match the query vector, based on similarity metrics like cosine similarity.
71 | 72 | ```csharp 73 | var query = "your search query"; 74 | var results = await vectorDatabase.SearchAsync(query); 75 | ``` 76 | 77 | The `.SearchAsync` method supports additional arguments to help with searching the vector database: 78 | 79 | ```csharp 80 | var results = await vectorDatabase.SearchAsync(queryText, 81 | threshold: 0.001f, // 0.2f - Cosine Similarity 82 | pageIndex: 0, // page index of search results (default: 0) 83 | pageCount: 10 // Number of results per page to return (default: no limit) 84 | ); 85 | ``` 86 | 87 | - `queryText`: The text query to search within the vector database. 88 | - `threshold`: The similarity threshold to use for searching the vector database using Cosine Similarity method. 89 | - `pageIndex`: The page index of search results to return. Default is `0`. 90 | - `pageCount`: The number of search results to return per page. Default is "no limit" (aka return all results) 91 | 92 | ## Summary 93 | 94 | Integrating an Ollama embedding model with **Build5Nines.SharpVector** empowers your .NET applications with advanced semantic search capabilities. By leveraging high-quality vector representations, you can achieve more accurate and context-aware search results, enhancing the overall user experience. 95 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/VectorTextResultItem.cs: -------------------------------------------------------------------------------- 1 | using System.Collections.Immutable; 2 | using Build5Nines.SharpVector.Id; 3 | 4 | namespace Build5Nines.SharpVector; 5 | 6 | /// 7 | /// Represents a result item from a semantic search on a vector database. 8 | /// 9 | /// The type of the document. 10 | /// The type of the metadata. 11 | public interface IVectorTextResultItem 12 | { 13 | /// 14 | /// The string of text that was vectorized. 15 | /// 16 | TDocument Text{ get; } 17 | 18 | /// 19 | /// The metadata associated with the text. 20 | /// 21 | TMetadata? 
Metadata { get; } 22 | 23 | /// 24 | /// The vector similarity score between the query and the text. (This is deprecated, use 'Similarity' instead) 25 | /// 26 | [Obsolete("Use 'Similarity' instead")] 27 | float VectorComparison { get; } 28 | 29 | /// 30 | /// The vector similarity score between the query and the text. 31 | /// 32 | float Similarity { get; } 33 | } 34 | 35 | /// 36 | /// Represents a result item from a semantic search on a vector database. 37 | /// 38 | /// The type of the ID. 39 | /// The type of the document. 40 | /// The type of the metadata. 41 | public interface IVectorTextResultItem 42 | : IVectorTextResultItem 43 | { 44 | TId Id { get; } 45 | } 46 | 47 | /// 48 | /// Represents a result item from a semantic search on a vector database. 49 | /// 50 | /// The type of the metadata. 51 | public interface IVectorTextResultItem 52 | : IVectorTextResultItem, IVectorTextResultItem 53 | { } 54 | 55 | /// 56 | /// Represents a result item from a semantic search on a vector database. 57 | /// 58 | /// The type of the ID. 59 | /// The type of the document. 60 | /// The type of the metadata. 61 | public class VectorTextResultItem 62 | : IVectorTextResultItem, IVectorTextResultItem 63 | { 64 | private IVectorTextItem _item; 65 | private TId _id; 66 | 67 | public VectorTextResultItem(TId id, IVectorTextItem item, float similarity) 68 | { 69 | _id = id; 70 | _item = item; 71 | Similarity = similarity; 72 | } 73 | 74 | /// 75 | /// The string of text that was vectorized. 76 | /// 77 | public TDocument Text { get => _item.Text; } 78 | 79 | /// 80 | /// The metadata associated with the text. 81 | /// 82 | public TMetadata? Metadata { get => _item.Metadata; } 83 | public TId Id { get => _id; } 84 | 85 | /// 86 | /// The vector representation / embeddings of the text. 87 | /// 88 | public ImmutableArray Vectors { get => ImmutableArray.Create(_item.Vector); } 89 | 90 | /// 91 | /// The vector similarity score between the query and the text. 
92 | /// 93 | public float Similarity { get; private set; } 94 | 95 | /// 96 | /// The vector similarity score between the query and the text. (This is deprecated, use 'Similarity' instead) 97 | /// 98 | [Obsolete("Use 'Similarity' instead")] 99 | public float VectorComparison { get => Similarity; } 100 | } 101 | 102 | /// 103 | /// Represents a result item from a semantic search on a vector database. 104 | /// 105 | /// The type of the metadata. 106 | public class VectorTextResultItem 107 | : VectorTextResultItem, IVectorTextResultItem 108 | { 109 | public VectorTextResultItem(int id, IVectorTextItem item, float vectorComparison) 110 | : base(id, item, vectorComparison) 111 | { } 112 | } 113 | 114 | -------------------------------------------------------------------------------- /src/OpenAIConsoleTest/Program.cs: -------------------------------------------------------------------------------- 1 | // See https://aka.ms/new-console-template for more information 2 | 3 | using System.Diagnostics; 4 | using System.Text.Json; 5 | using Azure; 6 | using Azure.AI.OpenAI; 7 | using Build5Nines.SharpVector; 8 | using Build5Nines.SharpVector.OpenAI; 9 | 10 | Console.WriteLine("Hello, World!"); 11 | 12 | //var openAIUri = new Uri("https://api.openai.com/"); 13 | var openAIUri = new Uri("https://{name}.openai.azure.com/"); 14 | var openAIKey = "xxxxxxxxxx"; 15 | var modelName = "text-embedding-ada-002"; 16 | 17 | var openAIClient = new AzureOpenAIClient(openAIUri, new AzureKeyCredential(openAIKey)); 18 | 19 | var embeddingClient = openAIClient.GetEmbeddingClient(modelName); 20 | 21 | var vdb = new BasicOpenAIMemoryVectorDatabase(embeddingClient); 22 | 23 | 24 | var jsonString = await File.ReadAllTextAsync("movies.json"); 25 | 26 | var importTimer = new Stopwatch(); 27 | importTimer.Start(); 28 | 29 | 30 | 31 | using (JsonDocument document = JsonDocument.Parse(jsonString)) 32 | { 33 | JsonElement root = document.RootElement; 34 | JsonElement movies = root.GetProperty("movies"); 
35 | 36 | await Parallel.ForEachAsync(movies.EnumerateArray(), async (movie, cancellationToken) => 37 | { 38 | Console.WriteLine($"Processing movie: {movie.GetProperty("title").GetString()}"); 39 | 40 | var text = movie.GetProperty("description").GetString(); 41 | var metadata = movie.GetProperty("title").GetString(); 42 | 43 | if (!string.IsNullOrWhiteSpace(text) && !string.IsNullOrWhiteSpace(metadata)) 44 | { 45 | await vdb.AddTextAsync(text, metadata); 46 | } 47 | }); 48 | 49 | // foreach (JsonElement movie in movies.EnumerateArray()) 50 | // { 51 | // var text = movie.GetProperty("description").GetString(); 52 | // var metadata = movie.GetProperty("title").GetString(); 53 | 54 | // if (!string.IsNullOrWhiteSpace(text) && !string.IsNullOrWhiteSpace(metadata)) 55 | // { 56 | // await vdb.AddTextAsync(text, metadata); 57 | // } 58 | // } 59 | } 60 | 61 | importTimer.Stop(); 62 | Console.WriteLine("Movie data imported into Vector Database."); 63 | Console.WriteLine($"Import took {importTimer.ElapsedMilliseconds} ms"); 64 | 65 | // Allow user to search for similar text 66 | Console.WriteLine("Type in prompt text, or type 'exit' to exit the app."); 67 | Console.WriteLine("What movie or TV show are you looking for? 
Try describing it in a few words."); 68 | 69 | 70 | while(true) { 71 | Console.Write("Prompt: "); 72 | var newPrompt = Console.ReadLine(); 73 | if (newPrompt == "exit") { 74 | break; 75 | } 76 | 77 | Console.WriteLine(string.Empty); 78 | 79 | if (newPrompt != null) { 80 | var timer = new Stopwatch(); 81 | timer.Start(); 82 | 83 | var pageSize = 3; 84 | // result = await vdb.Search(newPrompt, 85 | var result = await vdb.SearchAsync(newPrompt, 86 | threshold: 0.001f, // 0.2f, // Cosine Similarity - Only return results with similarity greater than this threshold 87 | // threshold: (float)1.4f, // Euclidean Distance - Only return results with distance less than this threshold 88 | 89 | //pageIndex: 0, // Page index of the search results (default is 0; the first page) 90 | pageCount: pageSize // Number of search results per page or max number to return 91 | ); 92 | 93 | timer.Stop(); 94 | Console.WriteLine($"Search took {timer.ElapsedMilliseconds} ms"); 95 | 96 | 97 | if (result == null || result.IsEmpty) 98 | { 99 | Console.WriteLine("No similar text found."); 100 | } else { 101 | Console.WriteLine("Similar Text Found!"); 102 | 103 | var firstItemIndex = result.PageIndex * pageSize + 1; 104 | var lastItemIndex = firstItemIndex + (pageSize > result.Texts.Count() ? 
result.Texts.Count() : pageSize) - 1; 105 | 106 | Console.WriteLine($"Page: {result.PageIndex + 1} (Showing {firstItemIndex} to {lastItemIndex} of Total {result.TotalCount})"); 107 | Console.WriteLine(string.Empty); 108 | foreach (var item in result.Texts) 109 | { 110 | Console.WriteLine($"Metadata: {item.Metadata}"); 111 | Console.WriteLine($"Vector Comparison: {item.VectorComparison}"); 112 | Console.WriteLine(item.Text); 113 | Console.WriteLine(string.Empty); 114 | } 115 | } 116 | } 117 | } -------------------------------------------------------------------------------- /docs/docs/persistence/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Data Persistence 3 | --- 4 | # :octicons-file-24: Data Persistence 5 | 6 | The `Build5Nines.SharpVector` library provides easy-to-use methods for saving a memory-based vector database to a file or stream and loading it again later. This is particularly useful for caching indexed content between runs, deploying pre-built vector stores, or shipping databases with your application. 7 | 8 | --- 9 | 10 | ## :material-file: File Persistence 11 | 12 | `Build5Nines.SharpVector` supports persisting the vector database to a file. 13 | 14 | !!! info 15 | This functionality is implemented as methods available to both the `Build5Nines.SharpVector.BasicMemoryVectorDatabase` and `Build5Nines.SharpVector.OpenAI.BasicOpenAIMemoryVectorDatabase`. These methods are actually extensions on the base `IVectorDatabase` interface, so all implementations of this interface will have this capability. 
16 | 17 | ### Save to File 18 | 19 | To persist your `BasicMemoryVectorDatabase` to disk, use the `SaveToFile` or `SaveToFileAsync` methods: 20 | 21 | ```csharp 22 | var vdb = new BasicMemoryVectorDatabase(); 23 | 24 | var filePath = "vectordata.b59vdb"; 25 | 26 | // persist vector database to file asynchronously 27 | await vdb.SaveToFileAsync(filePath); 28 | 29 | // -- or -- 30 | 31 | // persist vector database to file 32 | vdb.SaveToFile(filePath); 33 | ``` 34 | 35 | !!! info 36 | The file extension used in this example is `.b59vdb`, however this is arbitrary. The library doesn't look at the file extension. It only reads the binary contents of the file; which is actually in ZIP file format. 37 | 38 | ### Load from File 39 | 40 | To load a previously saved vector database from disk, use the `LoadFromFile` or `LoadFromFileAsync` methods: 41 | 42 | ```csharp 43 | var vdb = new BasicMemoryVectorDatabase(); 44 | 45 | var filePath = "vectordata.b59vdb"; 46 | 47 | // load vector database from file 48 | vdb.LoadFromFile(filePath); 49 | 50 | // -- or -- 51 | 52 | // load vector database from file asynchronously 53 | await vdb.LoadFromFileAsync(filePath); 54 | ``` 55 | 56 | --- 57 | 58 | ## :material-file-move: Persist to Stream 59 | 60 | The underlying methods used by `SaveToFile` and `LoadFromFile` methods for serializing the vector database to a `Stream` are available to use directly. This provides support for reading/writing to `MemoryStream` (or other streams) if the vector database needs to be persisted to something other than the local file system. 61 | 62 | !!! info 63 | These `SerializeToBinaryStream` and `DeserializeFromBinaryStream` methods are available in `v2.0.2` and later. 
64 | 65 | ### Write to Stream 66 | 67 | To persist your `BasicMemoryVectorDatabase` to a binary stream, use the `SerializeToBinaryStream` or `SerializeToBinaryStreamAsync` methods: 68 | 69 | ```csharp 70 | var vdb = new BasicMemoryVectorDatabase(); 71 | 72 | var stream = new MemoryStream(); 73 | 74 | // serialize to binary stream 75 | vdb.SerializeToBinaryStream(stream); 76 | 77 | // -- or -- 78 | 79 | // serialize asynchronously to binary stream 80 | await vdb.SerializeToBinaryStreamAsync(stream); 81 | ``` 82 | 83 | ### Read from Stream 84 | 85 | To load your `BasicMemoryVectorDatabase` from a binary stream, use the `DeserializeFromBinaryStream` and `DeserializeFromBinaryStreamAsync` methods: 86 | 87 | ```csharp 88 | // Be sure Stream position is at the start 89 | stream.Position = 0; 90 | 91 | // deserialize from binary stream 92 | vdb.DeserializeFromBinaryStream(stream); 93 | 94 | // -- or -- 95 | 96 | // deserialize asynchronously from binary stream 97 | await vdb.DeserializeFromBinaryStreamAsync(stream); 98 | ``` 99 | 100 | --- 101 | 102 | ## :material-database-outline: BasicDiskVectorDatabase 103 | 104 | The `BasicDiskVectorDatabase` provides a basic vector database implementation that automatically stores the vector store and vocabulary store to disk. Its implementation of vectorization is the same as the `BasicMemoryVectorDatabase`, but with the modification that it automatically persists the database to disk in the background to the specified folder path. 105 | 106 | Here's a basic example of using `BasicDiskVectorDatabase`: 107 | 108 | ```csharp 109 | // specify the folder where to persist the database data on disk 110 | var vdb = new BasicDiskVectorDatabase("C:/data/content-db"); 111 | foreach (var doc in documents) 112 | { 113 | vdb.AddText(doc.Id, doc.Text); 114 | } 115 | 116 | var results = vdb.Search("some text"); 117 | 118 | ``` 119 | 120 | ### Tips 121 | 122 | - Prefer absolute paths for the storage folder in production services.
123 | - Place the folder on fast storage (SSD) for best indexing/query performance. 124 | - Avoid sharing the same folder across multiple processes concurrently. 125 | - Back up the folder regularly to preserve your vector store and vocabulary. 126 | -------------------------------------------------------------------------------- /docs/docs/embeddings/openai/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: OpenAI Embeddings 3 | description: Integrate OpenAI’s powerful embeddings with SharpVector to supercharge your semantic search and AI features in .NET apps. 4 | --- 5 | 6 | # :simple-openai: OpenAI Embeddings 7 | 8 | Integrating OpenAI embeddings with **Build5Nines.SharpVector** enhances the semantic search capabilities of your .NET applications. By leveraging models like `text-embedding-ada-002`, you can generate high-quality vector representations of text, leading to more accurate and contextually relevant search results. 9 | 10 | ## Why Use OpenAI Embeddings? 11 | 12 | While **SharpVector** includes basic embedding generation, utilizing OpenAI's advanced models offers significant advantages: 13 | 14 | - **Improved Search Accuracy**: OpenAI's embeddings capture the semantic meaning of text, resulting in more relevant search outcomes. 15 | - **Pre-trained on Extensive Data**: These models are trained on vast datasets, enhancing their robustness and generalization capabilities. 16 | - **Optimized for Performance**: Designed for efficient retrieval and indexing, OpenAI embeddings facilitate faster search operations. 
17 | 18 | ## Getting Started 19 | 20 | ### Installation 21 | 22 | To integrate OpenAI embeddings with SharpVector, install the `Build5Nines.SharpVector.OpenAI` NuGet package: 23 | 24 | === ".NET CLI" 25 | ```bash 26 | dotnet add package Build5Nines.SharpVector.OpenAI 27 | ``` 28 | === "Package Manager" 29 | ```powershell 30 | Nuget\Install-Package Build5Nines.SharpVector.OpenAI 31 | ``` 32 | 33 | This package includes the core `Build5Nines.SharpVector` library and dependencies required to connect to OpenAI's embedding services. 34 | 35 | ### Setting Up the Embedding Client 36 | 37 | === "OpenAI" 38 | If you're using OpenAI's API directly: 39 | 40 | ```csharp 41 | using OpenAI; 42 | 43 | var openAIKey = "your-api-key"; 44 | var modelName = "text-embedding-ada-002"; 45 | 46 | var openAIClient = new OpenAIClient(openAIKey); 47 | var embeddingClient = openAIClient.GetEmbeddingClient(modelName); 48 | ``` 49 | === "Azure OpenAI" 50 | For applications utilizing Azure OpenAI: 51 | 52 | ```csharp 53 | using Azure; 54 | using Azure.AI.OpenAI; 55 | 56 | var openAIUri = new Uri("https://your-resource-name.openai.azure.com/"); 57 | var openAIKey = "your-api-key"; 58 | var modelName = "text-embedding-ada-002"; 59 | 60 | var openAIClient = new AzureOpenAIClient(openAIUri, new AzureKeyCredential(openAIKey)); 61 | var embeddingClient = openAIClient.GetEmbeddingClient(modelName); 62 | ``` 63 | 64 | ### Initializing the Vector Database 65 | 66 | With the embedding client set up, initialize the in-memory vector database: 67 | 68 | ```csharp 69 | using Build5Nines.SharpVector.OpenAI; 70 | 71 | var vectorDatabase = new BasicOpenAIMemoryVectorDatabase(embeddingClient); 72 | ``` 73 | 74 | - `embeddingClient`: The OpenAI Embedding Client to use for generating the vector embeddings.
75 | 76 | ## Adding Text Data 77 | 78 | To add text documents to the vector database: 79 | 80 | ```csharp 81 | // sync 82 | vectorDatabase.AddText(documentText, metadataText); 83 | 84 | // async 85 | await vectorDatabase.AddTextAsync(documentText, metadataText); 86 | ``` 87 | 88 | - `documentText`: The textual content to be vectorized. 89 | - `metadataText`: Associated metadata (e.g., document title, JSON string) stored alongside the vectorized text. 90 | 91 | !!! note 92 | Metadata is not vectorized but is retrieved with search results, providing context. 93 | 94 | ## Performing Similarity Search 95 | 96 | The `SearchAsync` method returns documents whose vector representations closely match the query vector, based on similarity metrics like cosine similarity. 97 | 98 | ```csharp 99 | var query = "your search query"; 100 | var results = await vectorDatabase.SearchAsync(query); 101 | ``` 102 | 103 | The `.SearchAsync` method supports additional arguments to help with searching the vector database: 104 | 105 | ```csharp 106 | var results = await vectorDatabase.SearchAsync(queryText, 107 | threshold: 0.001f, // 0.2f - Cosine Similarity 108 | pageIndex: 0, // page index of search results (default: 0) 109 | pageCount: 10 // Number of results per page to return (default: no limit) 110 | ); 111 | ``` 112 | 113 | - `queryText`: The text query to search within the vector database. 114 | - `threshold`: The similarity threshold to use for searching the vector database using Cosine Similarity method. 115 | - `pageIndex`: The page index of search results to return. Default is `0`. 116 | - `pageCount`: The number of search results to return per page. Default is "no limit" (aka return all results) 117 | 118 | ## Summary 119 | 120 | Integrating OpenAI embeddings with **Build5Nines.SharpVector** empowers your .NET applications with advanced semantic search capabilities.
By leveraging high-quality vector representations, you can achieve more accurate and context-aware search results, enhancing the overall user experience. 121 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/Preprocessing/BasicTextPreprocessor.cs: -------------------------------------------------------------------------------- 1 | namespace Build5Nines.SharpVector.Preprocessing; 2 | 3 | using System.Globalization; 4 | using System.Text; 5 | using System.Text.RegularExpressions; 6 | 7 | public class BasicTextPreprocessor : ITextPreprocessor 8 | { 9 | private const string space = " "; 10 | private const char charSpace = ' '; 11 | 12 | private const string regexChineseCharactersPattern = @"\p{IsCJKUnifiedIdeographs}"; 13 | private const string regexRemovePunctuation = @"[\p{P}$^`~=+|<>]"; // @"[\p{P}]"; 14 | // private const string regexTokenize = @"[\p{IsCJKUnifiedIdeographs}]|\p{So}\p{Sk}|[a-z0-9]+"; 15 | private const string regexWhitespacePattern = @"\s+"; 16 | private const string regexEmojiPattern = @"[\p{So}\uD83C-\uDBFF\uDC00-\uDFFF]"; 17 | 18 | public IEnumerable TokenizeAndPreprocess(string text) 19 | { 20 | if (string.IsNullOrWhiteSpace(text)) return Array.Empty(); 21 | 22 | // Tokens should always be lower case 23 | text = text.ToLower(); 24 | 25 | // Remove punctuation (excluding Chinese characters) 26 | text = Regex.Replace(text, regexRemovePunctuation, string.Empty); 27 | 28 | // Space pad special characters (Emoji and Chinese characters) 29 | text = SpacePadSpecialCharacters(text); 30 | 31 | // Remove extra whitespace characters 32 | text = Regex.Replace(text, regexWhitespacePattern, space).Trim(); 33 | 34 | // Split to Token array 35 | return text.Split(charSpace); 36 | 37 | 38 | // // Check if text contains Chinese characters using the CJK Unified Ideographs block 39 | // if (Regex.IsMatch(text, regexChineseCharactersPattern)) 40 | // { 41 | // if (Regex.IsMatch(text, regexEmojiPattern)) 
42 | // { 43 | // // Has Emoji 44 | // text = SpacePadSpecialCharacters(text, new string[] { regexEmojiPattern, regexChineseCharactersPattern }); 45 | // // remove extra whitespace characters 46 | // text = Regex.Replace(text, regexWhitespacePattern, space).Trim(); 47 | // } else { 48 | // // No Emoji 49 | // // Tokenize either by matching individual Chinese characters or contiguous word tokens (for Latin letters/digits) 50 | // var tokens = Regex.Matches(text, regexTokenize) 51 | // .Cast() 52 | // .Select(m => m.Value); 53 | // return tokens; 54 | // } 55 | // } 56 | // else 57 | // { 58 | // // if text contains emojis 59 | // if (Regex.IsMatch(text, regexEmojiPattern)) 60 | // { 61 | // text = SpacePadSpecialCharacters(text, new string[] { regexEmojiPattern }); 62 | // } 63 | 64 | // // remove extra whitespace characters 65 | // text = Regex.Replace(text, regexWhitespacePattern, space).Trim(); 66 | // } 67 | 68 | // return text.Split(charSpace); 69 | } 70 | 71 | public async Task> TokenizeAndPreprocessAsync(string text) 72 | { 73 | return await Task.Run(() => TokenizeAndPreprocess(text)); 74 | } 75 | 76 | 77 | private static string SpacePadSpecialCharacters(string text) 78 | { 79 | var spacePadPatterns = new List(); 80 | 81 | // Contains Chinese characters? 82 | if (Regex.IsMatch(text, regexChineseCharactersPattern)) 83 | { 84 | // Space pad Chinese characters 85 | spacePadPatterns.Add(regexChineseCharactersPattern); 86 | } 87 | 88 | // Contains Emoji? 
89 | if (Regex.IsMatch(text, regexEmojiPattern)) 90 | { 91 | // Space pad Emoji characters 92 | spacePadPatterns.Add(regexEmojiPattern); 93 | } 94 | 95 | if (spacePadPatterns.Count > 0) 96 | { 97 | // Space pad special characters based on the patterns selected 98 | text = SpacePadSpecialCharacters(text, spacePadPatterns.ToArray()); 99 | } 100 | 101 | return text; 102 | } 103 | 104 | private static string SpacePadSpecialCharacters(string text, string[] regexPatterns) 105 | { 106 | var enumerator = StringInfo.GetTextElementEnumerator(text); 107 | StringBuilder sb = new StringBuilder(); 108 | int i; 109 | while(enumerator.MoveNext()) 110 | { 111 | var element = enumerator.GetTextElement(); 112 | 113 | for (i = 0; i < regexPatterns.Length; i++) 114 | { 115 | if (Regex.IsMatch(element, regexPatterns[i])) 116 | { 117 | element = space + element + space; 118 | break; 119 | } 120 | } 121 | 122 | sb.Append(element); 123 | } 124 | return sb.ToString(); 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /docs/docs/get-started/metadata/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Metadata 3 | --- 4 | # :material-database-cog-outline: Metadata 5 | 6 | The `Build5Nines.SharpVector` vector database enables semantic search for `Text` that is stored in the database. Being able to semantically search text is an extremely useful way to look up more information related to the text. For this purpose, `Metadata` is stored alongside the `Text` within the vector database. This way, when `Text` is found when performing a semantic search, then the matching `Metadata` is also retrieved. 7 | 8 | ## Adding Metadata 9 | 10 | The `.AddText` and `.AddTextAsync` methods accept 2 arguments: 11 | 12 | - `text`: The `Text` that is added to the vector database and has vector embeddings generated for it.
13 | - `metadata`: This is additional data / information that is stored alongside the `Text`. 14 | 15 | ```csharp 16 | vdb.AddText(text, metadata); 17 | 18 | await vdb.AddTextAsync(text, metadata); 19 | ``` 20 | 21 | ## JSON and String Metadata 22 | 23 | When using the `BasicMemoryVectorDatabase` class, the `Metadata` values will always be of type `String`. This enables you to store a variety of values here, including: 24 | 25 | - **JSON data**: You can serialize any data to a JSON string for storage in the `Metadata` associated with a text item in the database. 26 | - **`String` value**: You can store any other string value as the `Metadata` associated with a text item in the database. This could be a URL, filename, or other information. 27 | 28 | !!! info "OpenAI and Ollama Support" 29 | When working with the [OpenAI](../../embeddings/openai/index.md) `BasicOpenAIMemoryVectorDatabase` and [Ollama](../../embeddings/ollama/index.md) `BasicOllamaMemoryVectorDatabase`, the `Metadata` data type is also `String`.
30 | 31 | Here are some examples of storing `string` metadata and retrieving it from the database: 32 | 33 | === "JSON data" 34 | 35 | ```csharp 36 | // create vector database 37 | var vdb = new BasicMemoryVectorDatabase(); 38 | 39 | // some text to store in the vector database 40 | var text = "some text value"; 41 | // serialize an object to json to store as metadata 42 | var json = JsonSerializer.Serialize(new MyMetadata{ 43 | Url = "https://build5nines.com", 44 | Author = "Chris Pietschmann" 45 | }); 46 | 47 | // Add text with metadata to vector database 48 | vdb.AddText(text, json); 49 | 50 | // perform semantic search 51 | var results = vdb.Search("something to search", pageCount: 5); 52 | 53 | // Loop through search results 54 | foreach(var item in results.Texts) { 55 | var text = item.Text; 56 | var json = item.Metadata; 57 | var metadata = JsonSerializer.Deserialize<MyMetadata>(json); 58 | 59 | // do something with results and metadata 60 | } 61 | ``` 62 | 63 | === "String value" 64 | 65 | ```csharp 66 | // create vector database 67 | var vdb = new BasicMemoryVectorDatabase(); 68 | 69 | // some text to store in the vector database 70 | var text = "some text value"; 71 | // some metadata to store 72 | var metadata = "https://build5nines.com"; 73 | 74 | // Add text with metadata to vector database 75 | vdb.AddText(text, metadata); 76 | 77 | // perform semantic search 78 | var results = vdb.Search("something to search", pageCount: 5); 79 | 80 | // Loop through search results 81 | foreach(var item in results.Texts) { 82 | var text = item.Text; 83 | var metadata = item.Metadata; 84 | 85 | // do something with results and metadata 86 | } 87 | ``` 88 | 89 | ## Custom Metadata Type 90 | 91 | The `MemoryVectorDatabase` generic class allows you to create a vector database that uses your own custom class as the metadata by defining that class using generics. This enables you to store a native .NET object as the metadata alongside the text in the vector database.
92 | 93 | Here's an example of using the `MemoryVectorDatabase` with a .NET class for the `Metadata`: 94 | 95 | ```csharp 96 | // create vector database 97 | var vdb = new MemoryVectorDatabase<MyMetadata>(); 98 | 99 | // some text to store in the vector database 100 | var text = "some text value"; 101 | // an object to store as metadata 102 | var metadata = new MyMetadata{ 103 | Url = "https://build5nines.com", 104 | Author = "Chris Pietschmann" 105 | }; 106 | 107 | // Add text with metadata to vector database 108 | vdb.AddText(text, metadata); 109 | 110 | // perform semantic search 111 | var results = vdb.Search("something to search", pageCount: 5); 112 | 113 | // Loop through search results 114 | foreach(var item in results.Texts) { 115 | var text = item.Text; 116 | var metadata = item.Metadata; 117 | 118 | var url = metadata.Url; 119 | var author = metadata.Author; 120 | 121 | // do something with results and metadata 122 | } 123 | ``` 124 | 125 | This will offer better performance with scenarios that require more complex metadata since you no longer need to handle serialization to/from JSON. 126 | 127 | !!! info "OpenAI and Ollama Support" 128 | The `OpenAIMemoryVectorDatabase` and `OllamaMemoryVectorDatabase` generic classes can also be used to define your own `Metadata` type when working with [OpenAI and Ollama embeddings](../../embeddings/index.md). 129 | -------------------------------------------------------------------------------- /samples/azure/document-intelligence/b59-azure-doc-intelligence/Program.cs: -------------------------------------------------------------------------------- 1 | using Azure; 2 | using Azure.AI.DocumentIntelligence; 3 | using System; 4 | using System.Text; 5 | using System.IO; 6 | using System.Threading.Tasks; 7 | using Build5Nines.SharpVector; 8 | 9 | // This sample demonstrates how to use the Document Intelligence client library to analyze a document using the prebuilt-read model.
10 | string endpoint = "https://.cognitiveservices.azure.com/"; 11 | string apiKey = ""; 12 | string filePath = "document.pdf"; // Can be .pdf, .docx, .jpg, etc. 13 | 14 | // Create timers to measure how long it takes to run the code 15 | var overallTimer = new System.Diagnostics.Stopwatch(); 16 | var stepTimer = new System.Diagnostics.Stopwatch(); 17 | overallTimer.Start(); 18 | 19 | 20 | // Create a DocumentIntelligenceClient 21 | var credential = new AzureKeyCredential(apiKey); 22 | var client = new DocumentIntelligenceClient(new Uri(endpoint), credential); 23 | 24 | var vdb = new BasicMemoryVectorDatabase(); 25 | 26 | 27 | 28 | 29 | 30 | // Read the file into a BinaryData object 31 | Console.WriteLine("Reading file..."); 32 | stepTimer.Start(); 33 | 34 | using var stream = File.OpenRead(filePath); 35 | byte[] buffer = new byte[stream.Length]; 36 | await stream.ReadAsync(buffer, 0, buffer.Length); 37 | var binaryData = BinaryData.FromBytes(buffer); 38 | 39 | stepTimer.Stop(); 40 | Console.WriteLine($"File loaded into memory: {stepTimer.ElapsedMilliseconds} ms"); 41 | 42 | Console.WriteLine("Analyzing document with Azure Document Intelligence..."); 43 | stepTimer.Restart(); 44 | 45 | // Analyze the document using the prebuilt-read model 46 | var operation = await client.AnalyzeDocumentAsync( 47 | WaitUntil.Completed, 48 | "prebuilt-read", 49 | binaryData); 50 | 51 | var docResult = operation.Value; 52 | 53 | stepTimer.Stop(); 54 | Console.WriteLine($"Document analysis completed: {stepTimer.ElapsedMilliseconds} ms"); 55 | 56 | stepTimer.Restart(); 57 | Console.WriteLine("Loading SharpVector database..."); 58 | 59 | foreach (var page in docResult.Pages) 60 | { 61 | var sb = new StringBuilder(); 62 | foreach (var line in page.Lines) 63 | { 64 | sb.AppendLine(line.Content); 65 | } 66 | 67 | // Add the text to the vector database 68 | // Let's use the Page Number as the metadata 69 | // Note: In a real-world scenario, you might want to use more meaningful metadata 70 
| var textMetadata = page.PageNumber.ToString(); 71 | vdb.AddText(sb.ToString(), textMetadata); 72 | } 73 | 74 | stepTimer.Stop(); 75 | Console.WriteLine($"SharpVector database loaded: {stepTimer.ElapsedMilliseconds} ms"); 76 | 77 | 78 | 79 | 80 | 81 | // Console.WriteLine(""); 82 | // Console.WriteLine("Loading PDF File into vector database..."); 83 | // stepTimer.Restart(); 84 | // // read pdf file with PdfPig locally 85 | // var vdb2 = new BasicMemoryVectorDatabase(); 86 | // using (var pdfDocument = UglyToad.PdfPig.PdfDocument.Open(filePath)) 87 | // { 88 | // foreach (var page in pdfDocument.GetPages()) 89 | // { 90 | // // Add the text to the vector database 91 | // // Let's use the Page Number as the metadata 92 | // // Note: In a real-world scenario, you might want to use more meaningful metadata 93 | // var metadata = page.Number.ToString(); 94 | // vdb.AddText(page.Text, metadata); 95 | // } 96 | // } 97 | // stepTimer.Stop(); 98 | // Console.WriteLine($"Vector database loaded: {stepTimer.ElapsedMilliseconds} ms"); 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | Console.WriteLine(""); 108 | Console.WriteLine("Searching in SharpVector database for \"Azure ML\" with similarity score > 0.5..."); 109 | stepTimer.Restart(); 110 | 111 | var query = "Azure ML"; 112 | var semanticResults = vdb.Search( 113 | query, 114 | threshold: 0.5f // Set a threshold for the similarity score to only match results above this value 115 | ); 116 | 117 | stepTimer.Stop(); 118 | Console.WriteLine($"Search completed: {stepTimer.ElapsedMilliseconds} ms"); 119 | 120 | 121 | Console.WriteLine("Top Matching Results:"); 122 | foreach (var result in semanticResults.Texts) 123 | { 124 | //var text = result.Text; 125 | var metadata = result.Metadata; 126 | var similarity = result.VectorComparison; 127 | Console.WriteLine($" - Page: {metadata} - Similarity: {similarity}"); 128 | } 129 | 130 | 131 | Console.WriteLine(""); 132 | Console.WriteLine("Searching in SharpVector database for 
\"Why use a Cloud Adoption Framework strategy\", top 3 results..."); 133 | stepTimer.Restart(); 134 | 135 | query = "Why use a Cloud Adoption Framework strategy"; 136 | semanticResults = vdb.Search( 137 | query, 138 | pageCount: 3 // Set the number of top results to return 139 | ); 140 | 141 | stepTimer.Stop(); 142 | Console.WriteLine($"Search completed: {stepTimer.ElapsedMilliseconds} ms"); 143 | 144 | 145 | Console.WriteLine("Top Matching Results:"); 146 | foreach (var result in semanticResults.Texts) 147 | { 148 | //var text = result.Text; 149 | var metadata = result.Metadata; 150 | var similarity = result.VectorComparison; 151 | Console.WriteLine($" - Page: {metadata} - Similarity: {similarity}"); 152 | } 153 | 154 | overallTimer.Stop(); 155 | Console.WriteLine(""); 156 | Console.WriteLine($"Overall processing time: {overallTimer.ElapsedMilliseconds} ms"); -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 
55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | chris@build5nines.com. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 
99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 
-------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/Data/TextDataLoader.cs: -------------------------------------------------------------------------------- 1 | namespace Build5Nines.SharpVector.Data; 2 | 3 | using System.ComponentModel.DataAnnotations; 4 | using System.Text.RegularExpressions; 5 | using Build5Nines.SharpVector.Preprocessing; 6 | 7 | public class TextDataLoader 8 | where TId : notnull 9 | where TMetadata : notnull 10 | { 11 | public TextDataLoader(IVectorDatabase vectorDatabase) 12 | { 13 | VectorDatabase = vectorDatabase; 14 | } 15 | 16 | const string _space = " "; 17 | 18 | public IVectorDatabase VectorDatabase { get; private set; } 19 | 20 | public IEnumerable AddDocument(string document, TextChunkingOptions chunkingOptions) 21 | { 22 | if (chunkingOptions.RetrieveMetadata == null) 23 | throw new ValidationException("TextChunkingOptions.RetrieveMetadata must be set"); 24 | 25 | var chunks = ChunkText(document, chunkingOptions); 26 | var ids = new List(); 27 | 28 | foreach (var chunk in chunks) 29 | { 30 | var id = VectorDatabase.AddText(chunk, chunkingOptions.RetrieveMetadata.Invoke(chunk)); 31 | ids.Add(id); 32 | } 33 | 34 | return ids; 35 | } 36 | 37 | protected List ChunkText(string text, TextChunkingOptions chunkingOptions) 38 | { 39 | switch (chunkingOptions.Method) 40 | { 41 | case TextChunkingMethod.Paragraph: 42 | return SplitIntoParagraphs(text); 43 | case TextChunkingMethod.Sentence: 44 | return SplitIntoSentences(text); 45 | case TextChunkingMethod.FixedLength: 46 | return SplitIntoChunks(text, chunkingOptions.ChunkSize); 47 | case TextChunkingMethod.OverlappingWindow: 48 | return SplitIntoOverlappingWindows(text, chunkingOptions.ChunkSize, chunkingOptions.OverlapSize); 49 | default: 50 | throw new ArgumentException("Invalid chunking method"); 51 | } 52 | } 53 | 54 | protected static List SplitIntoParagraphs(string text) 55 | { 56 | return text.Split(new[] { "\r\n\r\n", "\n\n" 
}, StringSplitOptions.RemoveEmptyEntries).ToList(); 57 | } 58 | 59 | protected static List SplitIntoSentences(string text) 60 | { 61 | return Regex.Split(text, @"(?<=[\.!\?])\s+").ToList(); 62 | } 63 | 64 | protected static List SplitIntoChunks(string text, int chunkSize) 65 | { 66 | var words = SplitIntoTokens(text); 67 | var chunks = new List(); 68 | 69 | for (int i = 0; i < words.Length; i += chunkSize) 70 | { 71 | chunks.Add(JoinTokens(words.Skip(i).Take(chunkSize))); 72 | } 73 | 74 | return chunks; 75 | } 76 | 77 | protected static List SplitIntoOverlappingWindows(string text, int chunkSize, int overlap) 78 | { 79 | var tokens = SplitIntoTokens(text); 80 | var chunks = new List(); 81 | 82 | if (overlap >= chunkSize) 83 | throw new ArgumentException("Overlap must be smaller than chunk size"); 84 | 85 | // Calculate the step size 86 | int step = chunkSize - overlap; 87 | int tokenLength = tokens.Length; 88 | for (int i = 0; i < tokenLength; i += step) 89 | { 90 | var chunk = JoinTokens(tokens.Skip(i).Take(chunkSize)); 91 | if (!string.IsNullOrWhiteSpace(chunk)) 92 | chunks.Add(chunk); 93 | 94 | if (i + chunkSize >= tokenLength) 95 | break; 96 | } 97 | return chunks; 98 | } 99 | 100 | private static string JoinTokens(IEnumerable tokens) 101 | { 102 | if (tokens == null) return string.Empty; 103 | 104 | var fullText = new System.Text.StringBuilder(); 105 | foreach (var token in tokens) 106 | { 107 | if (IsChinese(token)) 108 | fullText.Append(token); 109 | else 110 | fullText.Append(_space + token); 111 | } 112 | return fullText.ToString().Trim(); 113 | } 114 | 115 | private static bool IsChinese(string token) 116 | { 117 | // Checks if the token consists entirely of Chinese (CJK Unified Ideograph) characters. 
118 | return System.Text.RegularExpressions.Regex.IsMatch(token, @"^\p{IsCJKUnifiedIdeographs}+$"); 119 | } 120 | 121 | protected static string[] SplitIntoTokens(string text) 122 | { 123 | var processor = new BasicTextPreprocessor(); 124 | return processor.TokenizeAndPreprocess(text).ToArray(); 125 | } 126 | 127 | public async Task> AddDocumentAsync(string document, TextChunkingOptions chunkingOptions) 128 | { 129 | if (chunkingOptions.RetrieveMetadata == null) 130 | throw new ValidationException("TextChunkingOptions.RetrieveMetadata must be set"); 131 | 132 | var chunks = await ChunkTextAsync(document, chunkingOptions); 133 | var ids = new List(); 134 | object _lock = new object(); 135 | await Parallel.ForEachAsync(chunks, async (chunk, cancellationToken) => 136 | { 137 | var id = await VectorDatabase.AddTextAsync(chunk, chunkingOptions.RetrieveMetadata.Invoke(chunk)); 138 | lock (_lock) { 139 | ids.Add(id); 140 | } 141 | }); 142 | 143 | return ids; 144 | } 145 | 146 | private async Task> ChunkTextAsync(string text, TextChunkingOptions chunkingOptions) 147 | { 148 | return await Task.Run(() => ChunkText(text, chunkingOptions)); 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /src/Build5Nines.SharpVector/VectorStore/MemoryDictionaryVectorStore.cs: -------------------------------------------------------------------------------- 1 | using System.Collections; 2 | using System.Collections.Concurrent; 3 | using System.Collections.Generic; 4 | using System.Text; 5 | using System.Text.Json; 6 | 7 | namespace Build5Nines.SharpVector.VectorStore; 8 | 9 | /// 10 | /// A thread safe simple in-memory database for storing and querying vectorized text items. 
11 | /// 12 | /// 13 | /// 14 | public class MemoryDictionaryVectorStore : IVectorStore 15 | where TId : notnull 16 | { 17 | private ConcurrentDictionary> _database; 18 | 19 | /// 20 | /// The number of items in the database 21 | /// 22 | public int Count => _database.Count; 23 | 24 | public MemoryDictionaryVectorStore() { 25 | _database = new ConcurrentDictionary>(); 26 | } 27 | 28 | /// 29 | /// Retrieves a text and metadata by its ID 30 | /// 31 | /// 32 | /// 33 | /// 34 | public void Set(TId id, VectorTextItem item) 35 | { 36 | _database.AddOrUpdate(id, item, (key, oldValue) => item); 37 | } 38 | 39 | /// 40 | /// Gets all the Ids for every text. 41 | /// 42 | /// 43 | public IEnumerable GetIds() 44 | { 45 | return _database.Keys; 46 | } 47 | 48 | /// 49 | /// Retrieves a text and metadata by its ID asynchronously 50 | /// 51 | /// 52 | /// 53 | /// 54 | public async Task SetAsync(TId id, VectorTextItem item) 55 | { 56 | await Task.Run(() => Set(id, item)); 57 | } 58 | 59 | /// 60 | /// Retrieves a text and metadata by its ID 61 | /// 62 | /// 63 | /// 64 | /// 65 | public IVectorTextItem Get(TId id) 66 | { 67 | if (_database.TryGetValue(id, out var entry)) 68 | { 69 | return entry; 70 | } 71 | throw new KeyNotFoundException($"Text with ID {id} not found."); 72 | } 73 | 74 | /// 75 | /// Deletes a text by its ID 76 | /// 77 | /// 78 | /// The removed text item 79 | /// 80 | public IVectorTextItem Delete(TId id) 81 | { 82 | if (_database.ContainsKey(id)) 83 | { 84 | VectorTextItem? itemRemoved; 85 | _database.Remove(id, out itemRemoved); 86 | #pragma warning disable CS8603 // Possible null reference return. 87 | return itemRemoved; 88 | #pragma warning restore CS8603 // Possible null reference return. 
89 | } 90 | else 91 | { 92 | throw new KeyNotFoundException($"Text with ID {id} not found."); 93 | } 94 | } 95 | 96 | /// 97 | /// Checks if the database contains a key 98 | /// 99 | /// 100 | /// 101 | public bool ContainsKey(TId id) => _database.ContainsKey(id); 102 | 103 | 104 | 105 | 106 | public IEnumerator>> GetEnumerator() 107 | { 108 | return _database.GetEnumerator(); 109 | } 110 | 111 | System.Collections.IEnumerator System.Collections.IEnumerable.GetEnumerator() 112 | { 113 | return _database.GetEnumerator(); 114 | } 115 | 116 | #pragma warning disable CS1998 // Async method lacks 'await' operators and will run synchronously 117 | public async IAsyncEnumerator>> GetAsyncEnumerator(CancellationToken cancellationToken = default) 118 | #pragma warning restore CS1998 // Async method lacks 'await' operators and will run synchronously 119 | { 120 | foreach (var item in _database) 121 | { 122 | yield return item; 123 | } 124 | } 125 | 126 | public virtual async Task SerializeToJsonStreamAsync(Stream stream) 127 | { 128 | if (stream == null) 129 | { 130 | throw new ArgumentNullException(nameof(stream)); 131 | } 132 | 133 | await JsonSerializer.SerializeAsync>>(stream, _database); 134 | } 135 | 136 | public virtual async Task DeserializeFromJsonStreamAsync(Stream stream) 137 | { 138 | if (stream == null) 139 | { 140 | throw new ArgumentNullException(nameof(stream)); 141 | } 142 | 143 | this._database = await JsonSerializer.DeserializeAsync>>(stream) ?? new ConcurrentDictionary>(); 144 | } 145 | } 146 | 147 | /// 148 | /// A thread safe simple in-memory database for storing and querying vectorized text items. 149 | /// This is a simplified version of the MemoryDictionaryVectorStore class that uses string as the Document type. 
150 | /// 151 | /// 152 | /// 153 | public class MemoryDictionaryVectorStore 154 | : MemoryDictionaryVectorStore 155 | where TId : notnull 156 | { } -------------------------------------------------------------------------------- /src/SharpVector.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio Version 17 4 | VisualStudioVersion = 17.0.31903.59 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{cae970b1-fe01-4c10-a66f-23b0383b50f5}") = "ConsoleTest", "ConsoleTest\ConsoleTest.csproj", "{B535888B-58C6-4EC5-B3E2-E900A2149065}" 7 | EndProject 8 | Project("{509f7238-2b33-467a-b94a-8a649d18df4b}") = "Build5Nines.SharpVector", "Build5Nines.SharpVector\Build5Nines.SharpVector.csproj", "{770C2E6B-4B00-4F4D-9D38-F43D299EC0E6}" 9 | EndProject 10 | Project("{23abc664-2b8b-4c01-80d2-145553f45972}") = "SharpVectorTest", "SharpVectorTest\SharpVectorTest.csproj", "{42ff2370-2d81-4384-be82-35fd11d7dab8}" 11 | EndProject 12 | Project("{c2585652-05bb-43a4-b96c-d45fee8bc629}") = "SharpVectorPerformance", "SharpVectorPerformance\SharpVectorPerformance.csproj", "{AFF76051-E043-45EB-9B5F-05D9C45D0DC7}" 13 | EndProject 14 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Build5Nines.SharpVector.OpenAI", "Build5Nines.SharpVector.OpenAI\Build5Nines.SharpVector.OpenAI.csproj", "{CABF1DBE-8FE1-4EDF-B5DD-B1BFB88D93C3}" 15 | EndProject 16 | Project("{d241a75f-12b7-476d-8ad0-3fc3eae491ea}") = "Build5Nines.SharpVector.Ollama", "Build5Nines.SharpVector.Ollama\Build5Nines.SharpVector.Ollama.csproj", "{f64a2af6-c0cd-41cf-879e-db5ef9c33375}" 17 | EndProject 18 | Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "SharpVectorOpenAITest", "SharpVectorOpenAITest\SharpVectorOpenAITest.csproj", "{04E08FA2-C4B4-47B4-ABB0-6FD57EA5FFFB}" 19 | EndProject 20 | Project("{953bc932-6273-4fcc-8789-50057e494d39}") = "OpenAIConsoleTest", 
"OpenAIConsoleTest\OpenAIConsoleTest.csproj", "{0B5B1D8C-9D1C-4779-880E-09B8F1BD1DD2}" 21 | EndProject 22 | Project("{619c7671-0831-4096-b1bf-39c6ee0595a4}") = "OllamaConsoleTest", "OllamaConsoleTest\OllamaConsoleTest.csproj", "{e211095e-33d5-4ba4-a9f7-9d6057d807a7}" 23 | EndProject 24 | Project("{b170972d-92ac-48a4-af26-e44dff4801aa}") = "Build5Nines.SharpVector.Playground", "Build5Nines.SharpVector.Playground\Build5Nines.SharpVector.Playground.csproj", "{DF4912BA-17B3-458B-B4D9-AD75287EAC45}" 25 | EndProject 26 | Global 27 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 28 | Debug|Any CPU = Debug|Any CPU 29 | Release|Any CPU = Release|Any CPU 30 | EndGlobalSection 31 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 32 | {B535888B-58C6-4EC5-B3E2-E900A2149065}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 33 | {B535888B-58C6-4EC5-B3E2-E900A2149065}.Debug|Any CPU.Build.0 = Debug|Any CPU 34 | {B535888B-58C6-4EC5-B3E2-E900A2149065}.Release|Any CPU.ActiveCfg = Release|Any CPU 35 | {B535888B-58C6-4EC5-B3E2-E900A2149065}.Release|Any CPU.Build.0 = Release|Any CPU 36 | {770C2E6B-4B00-4F4D-9D38-F43D299EC0E6}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 37 | {770C2E6B-4B00-4F4D-9D38-F43D299EC0E6}.Debug|Any CPU.Build.0 = Debug|Any CPU 38 | {770C2E6B-4B00-4F4D-9D38-F43D299EC0E6}.Release|Any CPU.ActiveCfg = Release|Any CPU 39 | {770C2E6B-4B00-4F4D-9D38-F43D299EC0E6}.Release|Any CPU.Build.0 = Release|Any CPU 40 | {42ff2370-2d81-4384-be82-35fd11d7dab8}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 41 | {42ff2370-2d81-4384-be82-35fd11d7dab8}.Debug|Any CPU.Build.0 = Debug|Any CPU 42 | {42ff2370-2d81-4384-be82-35fd11d7dab8}.Release|Any CPU.ActiveCfg = Release|Any CPU 43 | {42ff2370-2d81-4384-be82-35fd11d7dab8}.Release|Any CPU.Build.0 = Release|Any CPU 44 | {AFF76051-E043-45EB-9B5F-05D9C45D0DC7}.Release|Any CPU.ActiveCfg = Release|Any CPU 45 | {AFF76051-E043-45EB-9B5F-05D9C45D0DC7}.Release|Any CPU.Build.0 = Release|Any CPU 46 | 
{AFF76051-E043-45EB-9B5F-05D9C45D0DC7}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 47 | {AFF76051-E043-45EB-9B5F-05D9C45D0DC7}.Debug|Any CPU.Build.0 = Debug|Any CPU 48 | {CABF1DBE-8FE1-4EDF-B5DD-B1BFB88D93C3}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 49 | {CABF1DBE-8FE1-4EDF-B5DD-B1BFB88D93C3}.Debug|Any CPU.Build.0 = Debug|Any CPU 50 | {CABF1DBE-8FE1-4EDF-B5DD-B1BFB88D93C3}.Release|Any CPU.ActiveCfg = Release|Any CPU 51 | {CABF1DBE-8FE1-4EDF-B5DD-B1BFB88D93C3}.Release|Any CPU.Build.0 = Release|Any CPU 52 | {f64a2af6-c0cd-41cf-879e-db5ef9c33375}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 53 | {f64a2af6-c0cd-41cf-879e-db5ef9c33375}.Debug|Any CPU.Build.0 = Debug|Any CPU 54 | {f64a2af6-c0cd-41cf-879e-db5ef9c33375}.Release|Any CPU.ActiveCfg = Release|Any CPU 55 | {f64a2af6-c0cd-41cf-879e-db5ef9c33375}.Release|Any CPU.Build.0 = Release|Any CPU 56 | {04E08FA2-C4B4-47B4-ABB0-6FD57EA5FFFB}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 57 | {04E08FA2-C4B4-47B4-ABB0-6FD57EA5FFFB}.Debug|Any CPU.Build.0 = Debug|Any CPU 58 | {04E08FA2-C4B4-47B4-ABB0-6FD57EA5FFFB}.Release|Any CPU.ActiveCfg = Release|Any CPU 59 | {04E08FA2-C4B4-47B4-ABB0-6FD57EA5FFFB}.Release|Any CPU.Build.0 = Release|Any CPU 60 | {0B5B1D8C-9D1C-4779-880E-09B8F1BD1DD2}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 61 | {0B5B1D8C-9D1C-4779-880E-09B8F1BD1DD2}.Debug|Any CPU.Build.0 = Debug|Any CPU 62 | {0B5B1D8C-9D1C-4779-880E-09B8F1BD1DD2}.Release|Any CPU.ActiveCfg = Release|Any CPU 63 | {0B5B1D8C-9D1C-4779-880E-09B8F1BD1DD2}.Release|Any CPU.Build.0 = Release|Any CPU 64 | {DF4912BA-17B3-458B-B4D9-AD75287EAC45}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 65 | {DF4912BA-17B3-458B-B4D9-AD75287EAC45}.Debug|Any CPU.Build.0 = Debug|Any CPU 66 | {DF4912BA-17B3-458B-B4D9-AD75287EAC45}.Release|Any CPU.ActiveCfg = Release|Any CPU 67 | {DF4912BA-17B3-458B-B4D9-AD75287EAC45}.Release|Any CPU.Build.0 = Release|Any CPU 68 | {e211095e-33d5-4ba4-a9f7-9d6057d807a7}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 69 | {e211095e-33d5-4ba4-a9f7-9d6057d807a7}.Debug|Any 
CPU.Build.0 = Debug|Any CPU 70 | {e211095e-33d5-4ba4-a9f7-9d6057d807a7}.Release|Any CPU.ActiveCfg = Release|Any CPU 71 | {e211095e-33d5-4ba4-a9f7-9d6057d807a7}.Release|Any CPU.Build.0 = Release|Any CPU 72 | EndGlobalSection 73 | GlobalSection(SolutionProperties) = preSolution 74 | HideSolutionNode = FALSE 75 | EndGlobalSection 76 | GlobalSection(ExtensibilityGlobals) = postSolution 77 | SolutionGuid = {861BA76B-E825-4CC4-81F7-97A00FA2BD48} 78 | EndGlobalSection 79 | EndGlobal 80 | -------------------------------------------------------------------------------- /src/SharpVectorTest/Preprocessing/BasicTextPreprocessorTests.cs: -------------------------------------------------------------------------------- 1 | namespace SharpVectorTest.Preprocessing; 2 | 3 | using System.Diagnostics; 4 | using System.Threading.Tasks; 5 | using Build5Nines.SharpVector; 6 | using Build5Nines.SharpVector.Embeddings; 7 | using Build5Nines.SharpVector.Id; 8 | using Build5Nines.SharpVector.Preprocessing; 9 | using Build5Nines.SharpVector.VectorCompare; 10 | using Build5Nines.SharpVector.Vectorization; 11 | using Build5Nines.SharpVector.VectorStore; 12 | using Build5Nines.SharpVector.Vocabulary; 13 | 14 | [TestClass] 15 | public class VectorDatabaseTests 16 | { 17 | [TestMethod] 18 | public void TokenizeAndPreprocess_Null() 19 | { 20 | var preprocessor = new BasicTextPreprocessor(); 21 | #pragma warning disable CS8625 // Cannot convert null literal to non-nullable reference type. 22 | var tokens = preprocessor.TokenizeAndPreprocess(null); 23 | #pragma warning restore CS8625 // Cannot convert null literal to non-nullable reference type. 
24 | 25 | Assert.AreEqual(0, tokens.Count()); 26 | } 27 | 28 | [TestMethod] 29 | public void TokenizeAndPreprocess_Empty() 30 | { 31 | var preprocessor = new BasicTextPreprocessor(); 32 | var tokens = preprocessor.TokenizeAndPreprocess(string.Empty); 33 | 34 | Assert.AreEqual(0, tokens.Count()); 35 | } 36 | 37 | [TestMethod] 38 | public void TokenizeAndPreprocess_Whitespace() 39 | { 40 | var preprocessor = new BasicTextPreprocessor(); 41 | var tokens = preprocessor.TokenizeAndPreprocess(" "); 42 | 43 | Assert.AreEqual(0, tokens.Count()); 44 | } 45 | 46 | [TestMethod] 47 | public void TokenizeAndPreprocess_Punctuation_01() 48 | { 49 | var preprocessor = new BasicTextPreprocessor(); 50 | var tokens = preprocessor.TokenizeAndPreprocess("Hello.!@#$%^&*()`~世-_=+ 界{}[]|:;\"',.<>/?!"); 51 | 52 | var expectedTokens = new List { "hello", "世", "界"}; 53 | for(var i = 0; i < expectedTokens.Count; i++) 54 | { 55 | Assert.AreEqual(expectedTokens[i], tokens.ElementAt(i), $"Index: {i} does not match"); 56 | } 57 | } 58 | 59 | [TestMethod] 60 | public void TokenizeAndPreprocess_Punctuation_02() 61 | { 62 | var preprocessor = new BasicTextPreprocessor(); 63 | var tokens = preprocessor.TokenizeAndPreprocess("Hello.!@#$%^&*()`~-_=+{}[]|:;\"',.<>/?"); 64 | 65 | var expectedTokens = new List { "hello" }; 66 | for(var i = 0; i < expectedTokens.Count; i++) 67 | { 68 | Assert.AreEqual(expectedTokens[i], tokens.ElementAt(i), $"Index: {i} does not match"); 69 | } 70 | } 71 | 72 | [TestMethod] 73 | public void TokenizeAndPreprocess_Punctuation_03() 74 | { 75 | var preprocessor = new BasicTextPreprocessor(); 76 | var tokens = preprocessor.TokenizeAndPreprocess("Hello.🔥!@#$%^&*()`~世-_=+ 界{}[]|:;\"',.<>/?"); 77 | 78 | var expectedTokens = new List { "hello", "🔥", "世", "界"}; 79 | for(var i = 0; i < expectedTokens.Count; i++) 80 | { 81 | Assert.AreEqual(expectedTokens[i], tokens.ElementAt(i), $"Index: {i} does not match"); 82 | } 83 | } 84 | 85 | [TestMethod] 86 | public void 
TokenizeAndPreprocess_Punctuation_04() 87 | { 88 | var preprocessor = new BasicTextPreprocessor(); 89 | var tokens = preprocessor.TokenizeAndPreprocess("Hello.!@#🔥$%^&*()`~-_=+{}[]|:;\"',.<>/?"); 90 | 91 | var expectedTokens = new List { "hello", "🔥" }; 92 | for(var i = 0; i < expectedTokens.Count; i++) 93 | { 94 | Assert.AreEqual(expectedTokens[i], tokens.ElementAt(i), $"Index: {i} does not match"); 95 | } 96 | } 97 | 98 | [TestMethod] 99 | public void TokenizeAndPreprocess_01() 100 | { 101 | var preprocessor = new BasicTextPreprocessor(); 102 | var tokens = preprocessor.TokenizeAndPreprocess("Hello, world! 你好,世界!"); 103 | 104 | var expectedTokens = new List { "hello", "world", "你", "好", "世", "界" }; 105 | for(var i = 0; i < expectedTokens.Count; i++) 106 | { 107 | Assert.AreEqual(expectedTokens[i], tokens.ElementAt(i), $"Index: {i} does not match"); 108 | } 109 | } 110 | 111 | [TestMethod] 112 | public void TokenizeAndPreprocess_02() 113 | { 114 | var preprocessor = new BasicTextPreprocessor(); 115 | var tokens = preprocessor.TokenizeAndPreprocess("Hello, World! How are you?"); 116 | 117 | var expectedTokens = new List { "hello", "world", "how", "are", "you" }; 118 | for(var i = 0; i < expectedTokens.Count; i++) 119 | { 120 | Assert.AreEqual(expectedTokens[i], tokens.ElementAt(i), $"Index: {i} does not match"); 121 | } 122 | } 123 | 124 | [TestMethod] 125 | public void TokenizeAndPreprocess_03() 126 | { 127 | var preprocessor = new BasicTextPreprocessor(); 128 | var tokens = preprocessor.TokenizeAndPreprocess("Hello, World! 👑🔥 How are you? 
🔥."); 129 | 130 | var expectedTokens = new List { "hello", "world", "👑", "🔥", "how", "are", "you", "🔥" }; 131 | for(var i = 0; i < expectedTokens.Count; i++) 132 | { 133 | Assert.AreEqual(expectedTokens[i], tokens.ElementAt(i), $"Index: {i} does not match ::" + String.Join("-", tokens)); 134 | } 135 | } 136 | 137 | [TestMethod] 138 | public void TokenizeAndPreprocess_04() 139 | { 140 | var preprocessor = new BasicTextPreprocessor(); 141 | var tokens = preprocessor.TokenizeAndPreprocess("Hello, world! 👑🔥你好,世界!👑 "); 142 | 143 | var expectedTokens = new List { "hello", "world", "👑", "🔥", "你", "好", "世", "界", "👑" }; 144 | for(var i = 0; i < expectedTokens.Count; i++) 145 | { 146 | Assert.AreEqual(expectedTokens[i], tokens.ElementAt(i), $"Index: {i} does not match ::" + String.Join("-", tokens)); 147 | } 148 | } 149 | } --------------------------------------------------------------------------------