├── .gitignore ├── CompactVectorSearch.csproj ├── Dockerfile ├── LICENSE ├── Program.cs ├── README.md ├── application_env ├── assets ├── FunctionCollection.json └── functions.csv ├── media ├── header.png └── swagger.png ├── src ├── Api │ └── Controllers │ │ ├── CompactVectorController.cs │ │ ├── SearchController.cs │ │ └── VectorDbService.cs └── VectorLibrary │ ├── Collections │ └── VectorCollection.cs │ ├── Interfaces │ └── IVector.cs │ ├── Models │ ├── FunctionCodePair.cs │ └── SearchResult.cs │ └── Utils │ ├── StoreUtility.cs │ └── VectorMath.cs └── tests └── test.rest /.gitignore: -------------------------------------------------------------------------------- 1 | bin/ 2 | obj/ 3 | *.env 4 | .DS_Store 5 | -------------------------------------------------------------------------------- /CompactVectorSearch.csproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | net8.0 5 | enable 6 | enable 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use the official ASP.NET Core runtime image 2 | FROM mcr.microsoft.com/dotnet/aspnet:8.0 AS base 3 | WORKDIR /app 4 | EXPOSE 80 5 | 6 | ENV ASPNETCORE_URLS=http://+:80 7 | 8 | 9 | # Use the SDK image to build the project 10 | FROM mcr.microsoft.com/dotnet/sdk:8.0 AS build 11 | WORKDIR /src 12 | 13 | # Copy the CSPROJ file and restore any NuGet packages 14 | COPY ["CompactVectorSearch.csproj", "./"] 15 | RUN dotnet restore "CompactVectorSearch.csproj" 16 | 17 | # Copy the rest of your source code 18 | COPY . . 
19 | 20 | # Build the project 21 | RUN dotnet build "CompactVectorSearch.csproj" -c Release -o /app/build 22 | 23 | # Publish the application 24 | FROM build AS publish 25 | RUN dotnet publish "CompactVectorSearch.csproj" -c Release -o /app/publish 26 | 27 | # Final stage/image 28 | FROM base AS final 29 | WORKDIR /app 30 | COPY --from=publish /app/publish . 31 | COPY application.env ./ 32 | 33 | ENTRYPOINT ["dotnet", "CompactVectorSearch.dll"] 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Yoav Dobrin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Program.cs: --------------------------------------------------------------------------------
using DotNetEnv;
using System.IO;
using VectorLibrary;
using VectorApi;

// Application entry point: loads configuration from application.env, registers the
// vector database service, warms it up, and starts the HTTP pipeline.
var builder = WebApplication.CreateBuilder(args);

// Environment variables are read from application.env in the working directory.
string configurationFile = Path.Combine(Directory.GetCurrentDirectory(), "application.env");
Env.Load(configurationFile);

// Add services to the container.
// The service is a singleton so that the in-memory collection is shared by all requests.
builder.Services.AddSingleton<VectorDbService>();
builder.Services.AddControllers();

// Name of the JSON blob holding the database. DB_FILE_NAME wins; BLOB_FILE_NAME (the key
// present in the shipped environment template) is honoured as a fallback so that a value
// configured there is not silently ignored. The final default is kept for ease of use.
string? configuredDbFileName = Environment.GetEnvironmentVariable("DB_FILE_NAME");
if (string.IsNullOrWhiteSpace(configuredDbFileName))
{
    configuredDbFileName = Environment.GetEnvironmentVariable("BLOB_FILE_NAME");
}
string dbFileName = string.IsNullOrWhiteSpace(configuredDbFileName)
    ? "FunctionCollection.json"
    : configuredDbFileName;

builder.Services.AddEndpointsApiExplorer();
builder.Services.AddSwaggerGen();

var app = builder.Build();

// Resolve the service and perform the initialization before the first request arrives.
var vectorDbService = app.Services.GetRequiredService<VectorDbService>();
await vectorDbService.InitializeAsync(dbFileName);

// Configure the HTTP request pipeline.
// HTTPS redirection is registered ahead of authorization so that requests are redirected
// before any authorization logic runs.
app.UseHttpsRedirection();
app.UseAuthorization();
app.UseSwagger();
app.UseSwaggerUI();

app.MapControllers();

app.Run();
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CompactVectorSearch: High-Speed Vector Database for .NET 2 | 3 | ![](./media/header.png) 4 | 5 | **CompactVectorSearch** is a .NET library designed for fast and accurate vector similarity searches in compact datasets. Perfect for small-scale applications, it excels in environments where large-scale solutions are overkill. This C# implementation offers a unique blend of performance, precision, and ease of use. 
6 | 7 | ## Overview 8 | 9 | CompactVectorSearch is a .NET vector database designed for efficient and precise semantic similarity searches in compact datasets. It can handle up to 200 items, delivering exceptional speed and accuracy. The system uses embedding vectors to represent text elements, which enables it to perform semantic similarity searches using cosine distance, Euclidean distance, or dot product. This process allows users to quickly and accurately identify the most semantically relevant items in the database. 10 | 11 | ### Embedding and Data Processing Workflow 12 | 13 | - **Initial Data Preparation**: The process begins by taking a CSV file containing descriptions of functions or other text elements. 14 | - **Vector Embedding with Azure OpenAI**: Each description is then transformed into an embedding vector using Azure OpenAI's embedding model (specifically, the Ada-02 model). This model excels in capturing the semantic essence of text data. 15 | - **JSON Storage**: After embedding, the vector data is saved as a JSON file on a storage system (I used Azure storage in this sample). This method ensures that the data is easily accessible and manageable. 16 | - **In-Memory Data Handling**: Upon initialization, CompactVectorSearch loads this JSON data into memory. This approach allows for rapid access and search capabilities within the database. 17 | - **Dynamic Query Processing**: When a search query is received, the system first converts the query text into an embedding vector using the same Ada-02 model. This ensures consistency in the representation of both the database items and the query. 18 | - **Search Execution**: The embedded query is then compared against the database vectors using the chosen similarity measure (cosine distance, dot product, etc.). This comparison identifies the most semantically relevant database item to the query. 
19 | 20 | ### Key Features 21 | 22 | - **Efficient Semantic Searches**: Perform semantic similarity checks using cosine distance, Euclidean distance, or dot product. 23 | - **Optimized for Small Datasets**: Specifically designed for datasets with fewer than 200 items, ensuring rapid response times, typically under 20ms. 24 | - **Custom Code Base**: Built from the ground up with custom code, minimizing memory and code footprint and avoiding dependency on heavy external packages. 25 | 26 | ## Sample Use Cases 27 | 28 | - **Natural Language to API Call Mapping**: 29 | - **Customer-Oriented Function Selection**: CompactVectorSearch revolutionizes how clients interact with a repository of API functions. Instead of navigating through complex technical documentation or requiring precise function names, customers can simply use natural language to describe their needs. 30 | - **Semantic Matching for API Execution**: For instance, consider a repository with diverse functions like calculating average temperatures for a region or generating commission statements for agents. When a customer inputs a query in natural language, CompactVectorSearch analyzes the semantic closeness of this input to the descriptions of available API functions. If the customer's query semantically aligns more closely with a weather forecast request, the corresponding API for weather data is automatically triggered and executed. 31 | - **Enhancing User Experience and Accessibility**: This feature significantly enhances the user experience, making the system more accessible and user-friendly. It allows non-technical users to interact with complex systems using simple, everyday language, thereby bridging the gap between technical functionality and practical usability. 32 | - **Customer Service Enhancement**: Assists in identifying the right solutions quickly in customer service scenarios, either via chatbots or human interaction. 
33 | 34 | ## How It Works 35 | 36 | CompactVectorSearch operates on the principle of vector space modeling, transforming text elements into embedding vectors. This conversion allows the library to perform semantic similarity checks efficiently. Here's a breakdown of its core functionality: 37 | 38 | ![swagger](./media/swagger.png) 39 | 40 | ### Vector Representation 41 | 42 | - **Text to Vector Transformation**: Each text element is converted into an embedding vector. These vectors represent the semantic essence of the text, making them ideal for similarity checks. 43 | 44 | ### Semantic Similarity Checks 45 | 46 | **Semantic Similarity Checks**: This refers to the process of determining how closely related or similar two pieces of content (typically text) are. In the context of vectors, this content is represented as vectors in a multidimensional space. There are three methods of measuring similarity implemented in CompactVectorSearch: 47 | 48 | - **Using Cosine Distance**: This is a method to measure similarity by calculating the cosine of the angle between two vectors. If the cosine value is close to 1, it indicates high similarity. This method effectively captures the orientation (but not magnitude) of vectors and is widely used in text analysis. 49 | 50 | - **Euclidean Distance**: This is another method of measuring similarity, which calculates the 'straight-line' distance between two points (or vectors) in space. It is the most direct way of measuring distance, but in high-dimensional spaces (like those often involved in text analysis), it can sometimes be less effective due to the curse of dimensionality. 51 | 52 | - **Dot Product**: This is a measure that multiplies corresponding entries of two vectors and sums up the results. In the context of similarity checks, a higher dot product can indicate more similarity, especially when vectors are normalized. 
53 | 54 | Each of these methods offers a different perspective on how vectors (and thus, the text or data they represent) can be considered similar. They are commonly used in various applications, including information retrieval, natural language processing, and data analysis. CompactVectorSearch can use any of these methods to perform semantic similarity checks. 55 | 56 | ### Efficient Search in Compact Databases 57 | 58 | - **Optimized for Less than 200 Items**: Specifically designed for small datasets, CompactVectorSearch maintains high performance even with a limit of 200 items. 59 | - **Rapid Response Time**: Thanks to its efficient design and the inherent simplicity of small datasets, the response time for a query is typically less than 20ms. 60 | 61 | ### Custom Implementation 62 | - **Minimal Footprint**: The library is built with custom code, eschewing bulky packages for a leaner memory and code footprint. 63 | - **Fast and Lightweight**: Its streamlined design ensures that searches are not only accurate but also remarkably fast. 64 | 65 | This unique combination of features makes CompactVectorSearch an invaluable tool in scenarios where precision, speed, and efficiency are paramount, especially in constrained environments like small datasets or memory-limited applications. 66 | 67 | ## Getting Started 68 | 69 | ### Docker Integration 70 | 71 | CompactVectorSearch comes with an included Dockerfile, which encapsulates the entire library and its functionalities. This Docker container is designed to expose several search routes, each corresponding to a different similarity measure method (cosine distance, Euclidean distance, and dot product). This setup serves as a practical example of how CompactVectorSearch can be integrated into a larger system. 72 | 73 | ### Implementing as a Service 74 | 75 | - **Minimal Memory Footprint**: One of the core advantages of CompactVectorSearch is its minimal memory requirement. 
This makes it ideal for deployment in environments with limited resources or where multiple services need to run concurrently without impacting each other's performance. 76 | 77 | - **Usage as a Database Service (DBService)**: The library is intended to be used as a DBService within your web API server. This approach allows other services in your system to interact with CompactVectorSearch as if it were a standard database service. 78 | 79 | - **Singleton Pattern**: For optimal performance and resource management, it is recommended to implement CompactVectorSearch as a singleton service within your application. This ensures that only one instance of the service is created and maintained throughout the application's lifecycle, thereby reducing memory usage and improving response times. Each component is thread safe. 80 | 81 | This setup enables CompactVectorSearch to function seamlessly within your application, providing a fast, efficient, and scalable solution for vector similarity searches in small datasets. 82 | 83 | ## Repository Structure 84 | 85 | This section provides an overview of the key directories and files in the CompactVectorSearch repository. Understanding this structure will help you navigate and utilize the project effectively. 86 | 87 | ### Project Overview 88 | 89 | - `CompactVectorSearch/`: Root directory of the project. 90 | 91 | #### Key Directories 92 | 93 | - `assets/`: Contains sample CSV and JSON files (`functions.csv` and `FunctionCollection.json`) used for testing and demonstration purposes. 94 | - `tests/`: Includes testing files such as `test.rest` for testing your endpoints. 95 | - `media/`: Holds media files like `header.png` used in the project documentation. 96 | 97 | #### Source Code 98 | 99 | - `src/`: The main source code of the project. 100 | - `Api/`: Contains the API logic of the application. 101 | - `Controllers/`: Houses controller classes like `CompactVectorController` and `SearchController`. 
102 | - `VectorDbService.cs`: Service for handling vector database operations. 103 | - `VectorLibrary/`: Core library of the project. 104 | - `Collections/`: Contains collections used in the project. 105 | - `Interfaces/`: Holds interface definitions. 106 | - `Models/`: Contains data models. 107 | - `Utils/`: Utility functions and helpers. 108 | 109 | #### Configuration and Documentation 110 | 111 | - `Dockerfile`: Docker configuration for setting up the project environment. 112 | - `application.env`: Contains environment variables used by the application. 113 | - `Program.cs`: The entry point of the .NET application. 114 | - `README.md`: The main documentation file for the project, detailing usage, setup, and other important information. 115 | - `LICENSE`: Information about the project's license. 116 | - `CompactVectorSearch.csproj`: The .NET project file containing configuration and dependencies. 117 | -------------------------------------------------------------------------------- /application_env: -------------------------------------------------------------------------------- 1 | SKIT_AOAI_APIKEY = 2 | SKIT_AOAI_ENDPOINT = 3 | SKIT_EMBEDDING_DEPLOYMENTNAME = 4 | BLOB_CONTAINER_NAME= 5 | BLOB_FILE_NAME= 6 | BLOB_STORAGE_CS="" 7 | -------------------------------------------------------------------------------- /assets/functions.csv: -------------------------------------------------------------------------------- 1 | id,FunctionDescription,FunctionCode 2 | 1,"Calculates the average cost of all orders by summing their total amounts and dividing by the number of orders to identify pricing trends.","function calcAvgOrderCost() {...}" 3 | 2,"Determines the product with the highest number of sales, providing insights into customer preferences and market demand.","function highestSellingProduct() {...}" 4 | 3,"Computes the total sales volume within a specified time frame, helping in financial planning and analysis.","function totalSales(period) {...}" 5 | 
4,"Generates detailed monthly sales reports, outlining trends, key metrics, and insights for strategic decision-making.","function monthlySalesReport() {...}" 6 | 5,"Calculates the percentage increase or decrease in sales over a period, indicating market performance and effectiveness of sales strategies.","function salesGrowthPercentage() {...}" 7 | 6,"Identifies sales regions with the lowest performance, highlighting areas needing strategic improvement.","function leastPerformingRegion() {...}" 8 | 7,"Automatically calculates sales commissions based on individual performance metrics, ensuring fair and prompt compensation.","function calculateCommissions() {...}" 9 | 8,"Predicts future sales based on historical data and trends, aiding in inventory management and marketing strategies.","function predictFutureSales() {...}" 10 | 9,"Analyzes patterns in customer purchases, helping to tailor marketing efforts and product offerings.","function analyzePurchasePatterns() {...}" 11 | 10,"Determines the profit margin for each product, crucial for pricing strategies and financial health assessment.","function profitMargin() {...}" 12 | 11,"Evaluates the success and ROI of sales campaigns, providing insights for future marketing initiatives.","function salesCampaignEffectiveness() {...}" 13 | 12,"Computes the average amount spent by customers, useful for segmentation and targeted marketing.","function averageCustomerSpend() {...}" 14 | 13,"Identifies potential opportunities for cross-selling products to customers, enhancing sales and customer relationships.","function crossSellingOpportunities() {...}" 15 | 14,"Assesses the financial impact of discounting strategies on overall sales volume and profitability.","function discountImpactAnalysis() {...}" 16 | 15,"Tracks the rate at which inventory is sold and replaced, highlighting supply chain efficiencies or issues.","function inventoryTurnover() {...}" 17 | 16,"Analyzes conversion rates at each stage of the sales funnel to 
identify bottlenecks and opportunities for improvement.","function salesFunnelConversion() {...}" 18 | 17,"Calculates the ROI of marketing efforts, aligning marketing spend with sales outcomes.","function marketingROICalculation() {...}" 19 | 18,"Determines the sales volume needed to cover the costs of launching new products, guiding pricing and marketing strategies.","function breakEvenPointCalculation() {...}" 20 | 19,"Evaluates the long-term value of customers, informing customer retention strategies and resource allocation.","function customerLifetimeValue() {...}" 21 | 20,"Measures the efficiency and effectiveness of the sales order process, identifying areas for operational improvement.","function salesOrderProcessEfficiency() {...}" 22 | -------------------------------------------------------------------------------- /media/header.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yodobrin/CompactVectorSearch/d02b279675fc1f907d21deb2af01fe9f91bd5bb7/media/header.png -------------------------------------------------------------------------------- /media/swagger.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yodobrin/CompactVectorSearch/d02b279675fc1f907d21deb2af01fe9f91bd5bb7/media/swagger.png -------------------------------------------------------------------------------- /src/Api/Controllers/CompactVectorController.cs: --------------------------------------------------------------------------------
using Microsoft.AspNetCore.Mvc;

namespace VectorApi
{
    /// <summary>
    /// Maintenance endpoints for the vector database: reloading the in-memory collection
    /// and building a new JSON database from a CSV file.
    /// </summary>
    [ApiController]
    [Route("[controller]")]
    public class CompactVectorController : ControllerBase
    {
        private readonly VectorDbService _dbService;

        /// <summary>Creates the controller around the shared database service.</summary>
        /// <param name="dbService">Service that owns the collection and the embedding client.</param>
        public CompactVectorController(VectorDbService dbService)
        {
            _dbService = dbService;
        }

        /// <summary>
        /// Re-runs the service initialization against the given JSON file name.
        /// </summary>
        /// <param name="jsonFileName">Name of the JSON file to load the database from.</param>
        /// <returns>200 OK with a confirmation message once initialization has completed.</returns>
        [HttpGet("reload")]
        public async Task<IActionResult> Get(string jsonFileName)
        {
            await _dbService.InitializeAsync(jsonFileName);
            return Ok("Reloaded DB");
        }

        /// <summary>
        /// Reads a CSV file and creates the JSON database file from it.
        /// </summary>
        /// <param name="csvFileName">Name of the CSV input file.</param>
        /// <param name="jsonFileName">Name of the JSON file to create.</param>
        /// <returns>200 OK with a message naming both files.</returns>
        [HttpGet("load")]
        public async Task<IActionResult> Get(string csvFileName, string jsonFileName)
        {
            await _dbService.CreateDatabaseAsync(csvFileName, jsonFileName);
            return Ok($"Read CSV {csvFileName} and created DB {jsonFileName}");
        }
    }
}
-------------------------------------------------------------------------------- /src/Api/Controllers/SearchController.cs: --------------------------------------------------------------------------------
using Microsoft.AspNetCore.Mvc;

namespace VectorApi
{
    /// <summary>
    /// Search endpoints, one per similarity measure. Each action forwards the query text
    /// to <see cref="VectorDbService"/> and returns the result with 200 OK.
    /// </summary>
    [ApiController]
    [Route("search")] // Controller-level route
    public class SearchController : ControllerBase
    {
        private readonly VectorDbService _dbService;

        /// <summary>Creates the controller around the shared database service.</summary>
        /// <param name="dbService">Service that performs the searches.</param>
        public SearchController(VectorDbService dbService)
        {
            _dbService = dbService;
        }

        /// <summary>Searches using cosine similarity.</summary>
        /// <param name="query">Free-text query.</param>
        [HttpGet("cosine")]
        public async Task<IActionResult> SearchByCosineSimilarity(string query)
        {
            var results = await _dbService.SearchByCosineSimilarity(query);
            return Ok(results);
        }

        /// <summary>Searches using the dot product.</summary>
        /// <param name="query">Free-text query.</param>
        [HttpGet("dotproduct")]
        public async Task<IActionResult> SearchByDotProduct(string query)
        {
            var results = await _dbService.SearchByDotProduct(query);
            return Ok(results);
        }

        /// <summary>Searches using Euclidean distance.</summary>
        /// <param name="query">Free-text query.</param>
        [HttpGet("euclidean")]
        public async Task<IActionResult> SearchByEuclideanDistance(string query)
        {
            var results = await _dbService.SearchByEuclideanDistance(query);
            return Ok(results);
        }
    }
}
-------------------------------------------------------------------------------- /src/Api/Controllers/VectorDbService.cs: -------------------------------------------------------------------------------- 1 | using VectorLibrary; 2 | using Azure.Storage.Blobs; 3 | using System.IO; 4 | using Azure.AI.OpenAI; 5 | using Azure; 6 | 7 | 
namespace VectorApi
{
    /// <summary>
    /// Holds the in-memory vector collection and the Azure OpenAI client used to embed
    /// query text, and exposes the search / load / save operations used by the controllers.
    /// </summary>
    public class VectorDbService
    {
        /// <summary>The loaded collection; null until a database blob has been loaded.</summary>
        public VectorCollection? VectorCollection { get; private set; }
        private OpenAIClient? _openAIClient;
        private string? _embeddingDeploymentName;

        public VectorDbService()
        {
            Console.WriteLine("VectorDbService constructor called");
        }

        /// <summary>Embeds <paramref name="query"/> and returns the best match by dot product.</summary>
        /// <exception cref="InvalidOperationException">The collection or the OpenAI client is not initialized.</exception>
        public async Task<SearchResult> SearchByDotProduct(string query)
        {
            VectorCollection collection = RequireCollection();
            float[] queryVector = await GetEmbeddings(query);
            return collection.FindByDotProduct(queryVector, item => item.GetVector());
        }

        /// <summary>Embeds <paramref name="query"/> and returns the best match by cosine similarity.</summary>
        /// <exception cref="InvalidOperationException">The collection or the OpenAI client is not initialized.</exception>
        public async Task<SearchResult> SearchByCosineSimilarity(string query)
        {
            VectorCollection collection = RequireCollection();
            float[] queryVector = await GetEmbeddings(query);
            return collection.FindByCosineSimilarity(queryVector, item => item.GetVector());
        }

        /// <summary>Embeds <paramref name="query"/> and returns the best match by Euclidean distance.</summary>
        /// <exception cref="InvalidOperationException">The collection or the OpenAI client is not initialized.</exception>
        public async Task<SearchResult> SearchByEuclideanDistance(string query)
        {
            VectorCollection collection = RequireCollection();
            float[] queryVector = await GetEmbeddings(query);
            return collection.FindByEuclideanDistance(queryVector, item => item.GetVector());
        }

        /// <summary>
        /// Loads function/code pairs from the CSV blob (embedding them through the OpenAI client)
        /// and saves the result as a JSON blob.
        /// </summary>
        /// <param name="inputCsvFileName">Name of the CSV blob to read.</param>
        /// <param name="outputJsonFileName">Name of the JSON blob to write.</param>
        /// <exception cref="InvalidOperationException">The OpenAI client or deployment name is not initialized.</exception>
        public async Task CreateDatabaseAsync(string inputCsvFileName, string outputJsonFileName)
        {
            if (string.IsNullOrEmpty(_embeddingDeploymentName) || _openAIClient == null)
            {
                throw new InvalidOperationException("OpenAI Client or Embedding Deployment Name is null");
            }

            List<FunctionCodePair> functionCodePairs = await StoreUtility.LoadFunctionCodePairsFromAzureBlobAsync(inputCsvFileName, _openAIClient, _embeddingDeploymentName);
            await StoreUtility.SaveFunctionCodePairsToAzureBlobAsync(functionCodePairs, outputJsonFileName);
        }

        /// <summary>Saves the currently loaded collection to the given JSON blob.</summary>
        /// <param name="jsonFileName">Name of the JSON blob to write.</param>
        /// <exception cref="InvalidOperationException">No collection is loaded.</exception>
        public async Task SaveFunctionCodePairsToBlobAsync(string jsonFileName)
        {
            if (VectorCollection == null)
            {
                throw new InvalidOperationException("VectorCollection is not initialized.");
            }

            await StoreUtility.SaveFunctionCodePairsToAzureBlobAsync(VectorCollection.GetFunctionCodePairs(), jsonFileName);
        }

        // Returns the loaded collection or throws. InvalidOperationException derives from
        // Exception, so callers that caught the previous bare Exception keep working.
        private VectorCollection RequireCollection()
        {
            return VectorCollection ?? throw new InvalidOperationException("VectorCollection is null");
        }

        // Requests an embedding vector for the query text from the configured deployment.
        private async Task<float[]> GetEmbeddings(string query)
        {
            if (string.IsNullOrEmpty(_embeddingDeploymentName) || _openAIClient == null)
            {
                throw new InvalidOperationException("OpenAI Client or Embedding Deployment Name is null");
            }

            var embeddingsOptions = new EmbeddingsOptions(_embeddingDeploymentName, new List<string> { query });
            var embeddingsResponse = await _openAIClient.GetEmbeddingsAsync(embeddingsOptions);
            return embeddingsResponse.Value.Data[0].Embedding.ToArray();
        }

        // Downloads the JSON blob and replaces the in-memory collection with its content.
        // Missing configuration or a missing blob is logged and leaves the collection unchanged.
        private async Task LoadDataFromBlobStorage(string jsonFileName)
        {
            string? accountConnectionString = Environment.GetEnvironmentVariable("BLOB_STORAGE_CS");
            string? containerName = Environment.GetEnvironmentVariable("BLOB_CONTAINER_NAME");

            // Empty values (e.g. BLOB_STORAGE_CS="" in the env file) are treated like missing ones.
            if (string.IsNullOrWhiteSpace(accountConnectionString) || string.IsNullOrWhiteSpace(containerName) || string.IsNullOrEmpty(jsonFileName))
            {
                Console.WriteLine("One or more environment variables are not set. Please set BLOB_STORAGE_CS, BLOB_CONTAINER_NAME and BLOB_FILE_NAME");
                return;
            }

            var blobServiceClient = new BlobServiceClient(accountConnectionString);
            var containerClient = blobServiceClient.GetBlobContainerClient(containerName);
            BlobClient blobClient = containerClient.GetBlobClient(jsonFileName);

            // Asynchronous existence check so the calling thread is not blocked on network I/O.
            Response<bool> exists = await blobClient.ExistsAsync();
            if (!exists.Value)
            {
                Console.WriteLine("DB Blob does not exist");
                return;
            }

            var response = await blobClient.DownloadContentAsync();
            using var stream = response.Value.Content.ToStream();
            VectorCollection = await VectorLibrary.VectorCollection.CreateFromMemoryAsync(stream);
        }

        /// <summary>
        /// Loads the database blob and creates the OpenAI client from the SKIT_AOAI_APIKEY,
        /// SKIT_AOAI_ENDPOINT and SKIT_EMBEDDING_DEPLOYMENTNAME environment variables.
        /// </summary>
        /// <remarks>
        /// When any of the OpenAI settings is missing or empty, a message is logged and the client
        /// is left unset; later calls that need it then fail with a clear
        /// <see cref="InvalidOperationException"/> instead of using placeholder values.
        /// </remarks>
        /// <param name="jsonFileName">Name of the JSON blob holding the database.</param>
        public async Task InitializeAsync(string jsonFileName)
        {
            Console.WriteLine("Initializing VectorDbService & OpenAI Client");
            await LoadDataFromBlobStorage(jsonFileName);

            string? oAiApiKey = Environment.GetEnvironmentVariable("SKIT_AOAI_APIKEY");
            string? oAiEndpoint = Environment.GetEnvironmentVariable("SKIT_AOAI_ENDPOINT");
            string? deploymentName = Environment.GetEnvironmentVariable("SKIT_EMBEDDING_DEPLOYMENTNAME");

            if (string.IsNullOrWhiteSpace(oAiApiKey) || string.IsNullOrWhiteSpace(oAiEndpoint) || string.IsNullOrWhiteSpace(deploymentName))
            {
                Console.WriteLine("One or more environment variables are not set. Please set SKIT_AOAI_APIKEY, SKIT_AOAI_ENDPOINT and SKIT_EMBEDDING_DEPLOYMENTNAME");
                return;
            }

            _embeddingDeploymentName = deploymentName;
            var azureKeyCredential = new AzureKeyCredential(oAiApiKey);
            _openAIClient = new OpenAIClient(new Uri(oAiEndpoint), azureKeyCredential);
            Console.WriteLine("... Initialized VectorDbService & OpenAI Client !");
        }
    }
}
-------------------------------------------------------------------------------- /src/VectorLibrary/Collections/VectorCollection.cs: --------------------------------------------------------------------------------
using System.Text.Json;
using System.IO;

namespace VectorLibrary
{
    public class VectorCollection
    {
        private readonly int dimensions;
        private List<FunctionCodePair> objects = new List<FunctionCodePair>();

        /// <summary>Serializes all items as indented JSON and writes them to <paramref name="path"/>.</summary>
        public async Task SaveToDiskAsync(string path)
        {
            string json = JsonSerializer.Serialize(objects, new JsonSerializerOptions { WriteIndented = true });
            await File.WriteAllTextAsync(path, json);
        }

        /// <summary>Returns the list of items held by the collection (the same list instance, not a copy).</summary>
        public List<FunctionCodePair> GetFunctionCodePairs()
        {
            return objects;
        }

        /// <summary>
        /// Reads JSON from <paramref name="dataStream"/> and builds a collection from it.
        /// A JSON payload that deserializes to null yields an empty collection.
        /// </summary>
        /// <param name="dataStream">Stream containing a JSON array of function/code pairs.</param>
        public static async Task<VectorCollection> CreateFromMemoryAsync(Stream dataStream)
        {
            // Stopwatch is monotonic; wall-clock ticks can jump and have coarse resolution.
            var stopwatch = System.Diagnostics.Stopwatch.StartNew();
            using var reader = new StreamReader(dataStream);
            string jsonFromStream = await reader.ReadToEndAsync();
            List<FunctionCodePair> loadedObjects = JsonSerializer.Deserialize<List<FunctionCodePair>>(jsonFromStream) ?? new List<FunctionCodePair>();

            // NOTE(review): 1536 is presumably the embedding size of the model named in the README - confirm.
            var collection = new VectorCollection(1536);
            collection.AddRange(loadedObjects);
            stopwatch.Stop();
            Console.WriteLine($"Time to load data from memory: {(float)stopwatch.Elapsed.TotalMilliseconds} ms");
            return collection;
        }
 35 | public static async Task CreateFromDiskAsync(string path) 36 | { 37 | string jsonFromFile = await File.ReadAllTextAsync(path); 38 | List loadedObjects = JsonSerializer.Deserialize>(jsonFromFile) ?? 
new List(); 39 | var collection = new VectorCollection(1536); 40 | collection.AddRange(loadedObjects); 41 | return collection; 42 | } 43 | 44 | 45 | 
        /// <summary>Creates an empty collection that records the given vector dimension count.</summary>
        /// <param name="dimensions">Number of dimensions; stored and exposed via <see cref="Dimensions"/>.</param>
        public VectorCollection(int dimensions)
        {
            this.dimensions = dimensions;
        }

        /// <summary>The dimension count passed to the constructor.</summary>
        public int Dimensions => dimensions;

        /// <summary>Appends one item.</summary>
        public void Add(FunctionCodePair obj)
        {
            objects.Add(obj);
        }

        /// <summary>Appends all items of the sequence.</summary>
        public void AddRange(IEnumerable<FunctionCodePair> _objects)
        {
            objects.AddRange(_objects);
        }

        /// <summary>Returns the item at <paramref name="index"/>.</summary>
        public IVector GetItem(int index)
        {
            return objects[index];
        }

        private delegate float ComparisonStrategy(float[] vectorA, float[] vectorB);

        /*
         * Finds the item whose score against the query vector is highest.
         * vectorSelector picks the vector of each item; strategy computes the score
         * (item vector first, query second). Strategies where a lower value is better
         * must negate their result before it gets here.
         * The returned item is a copy without its vector, together with the best score and
         * the elapsed search time in milliseconds.
         */
        private SearchResult FindBestMatch(float[] query, Func<FunctionCodePair, float[]> vectorSelector, ComparisonStrategy strategy)
        {
            ArgumentNullException.ThrowIfNull(query);
            ArgumentNullException.ThrowIfNull(vectorSelector);
            ArgumentNullException.ThrowIfNull(strategy);

            // An empty collection has no best match; fail with a clear message instead of an
            // index-out-of-range error from the list.
            if (objects.Count == 0)
            {
                throw new InvalidOperationException("Cannot search an empty VectorCollection.");
            }

            var stopwatch = System.Diagnostics.Stopwatch.StartNew();

            // Seed with the first item's real score so the reported value always belongs to
            // the returned item.
            int bestIndex = 0;
            float bestValue = strategy(vectorSelector(objects[0]), query);

            for (int i = 1; i < objects.Count; i++)
            {
                float currentValue = strategy(vectorSelector(objects[i]), query);
                if (currentValue > bestValue)
                {
                    bestValue = currentValue;
                    bestIndex = i;
                }
            }

            stopwatch.Stop();
            return new SearchResult(objects[bestIndex].GetSafeVersion(), bestValue, (float)stopwatch.Elapsed.TotalMilliseconds);
        }

        /// <summary>Returns the item with the highest dot product against <paramref name="query"/>.</summary>
        /// <exception cref="InvalidOperationException">The collection is empty.</exception>
        public SearchResult FindByDotProduct(float[] query, Func<FunctionCodePair, float[]> vectorSelector)
        {
            return FindBestMatch(query, vectorSelector, VectorMath.DotProduct);
        }

        /// <summary>Returns the item with the highest cosine similarity to <paramref name="query"/>.</summary>
        /// <exception cref="InvalidOperationException">The collection is empty.</exception>
        public SearchResult FindByCosineSimilarity(float[] query, Func<FunctionCodePair, float[]> vectorSelector)
        {
            return FindBestMatch(query, vectorSelector, VectorMath.CosineSimilarity);
        }

        /// <summary>
        /// Returns the item with the smallest Euclidean distance to <paramref name="query"/>.
        /// The reported value is the negated distance, because the search keeps the maximum score.
        /// </summary>
        /// <exception cref="InvalidOperationException">The collection is empty.</exception>
        public SearchResult FindByEuclideanDistance(float[] query, Func<FunctionCodePair, float[]> vectorSelector)
        {
            return FindBestMatch(query, vectorSelector, (a, b) => -VectorMath.EuclideanDistance(a, b));
        }
    }
}
-------------------------------------------------------------------------------- /src/VectorLibrary/Interfaces/IVector.cs: --------------------------------------------------------------------------------
namespace VectorLibrary
{
    /// <summary>An item that can supply a vector.</summary>
    public interface IVector
    {
        /// <summary>Returns the item's vector.</summary>
        float[] GetVector();
    }
}
-------------------------------------------------------------------------------- /src/VectorLibrary/Models/FunctionCodePair.cs: --------------------------------------------------------------------------------
namespace VectorLibrary
{
    /// <summary>
    /// A function description, its code, and optionally the vector associated with the description.
    /// </summary>
    public class FunctionCodePair : IVector
    {
        public string Function { get; set; }
        public string Code { get; set; }
        public float[]? FunctionVector { get; set; }
        public int Id { get; set; }

        public FunctionCodePair(int id, string function, string code)
        {
            Id = id;
            Function = function;
            Code = code;
        }

        /// <summary>Returns <see cref="FunctionVector"/>.</summary>
        /// <exception cref="InvalidOperationException">The vector has not been set.</exception>
        public float[] GetVector()
        {
            return FunctionVector ?? throw new InvalidOperationException("FunctionVector is not set.");
        }

        /// <summary>Returns a copy carrying only id, function and code (no vector).</summary>
        public FunctionCodePair GetSafeVersion()
        {
            return new FunctionCodePair(Id, Function, Code);
        }
    }
}
-------------------------------------------------------------------------------- /src/VectorLibrary/Models/SearchResult.cs: --------------------------------------------------------------------------------
namespace VectorLibrary
{
    /// <summary>Result of a search: the matched item, its score, and the search time in milliseconds.</summary>
    public class SearchResult
    {
        public FunctionCodePair Item { get; set; }
        public float Value { get; set; }
        public float Ms { get; set; }

        public SearchResult(FunctionCodePair item, float value, float ms)
        {
            Item = item;
            Value = value;
            Ms = ms;
        }
    }
}
-------------------------------------------------------------------------------- /src/VectorLibrary/Utils/StoreUtility.cs: -------------------------------------------------------------------------------- 1 | using Azure.Storage.Blobs; 2 | using System; 3 | using System.Collections.Generic; 4 | using System.Globalization; 5 | using System.IO; 6 | using System.Text; 7 | using System.Text.Json; 8 | using System.Threading.Tasks; 9 | using CsvHelper; 10 | using CsvHelper.Configuration; 11 | using Azure.AI.OpenAI; 12 | 13 | namespace VectorLibrary 14 | { 15 | public static class StoreUtility 16 | { 17 | 18 | public static async Task SaveFunctionCodePairsToAzureBlobAsync(List functionCodePairs, string jsonFileName) 19 | { 20 | // Retrieve connection string and container name from environment variables 21 | string connectionString = Environment.GetEnvironmentVariable("BLOB_STORAGE_CS") ?? 
"BLOB_STORAGE_CS not found"; 22 | string containerName = Environment.GetEnvironmentVariable("BLOB_CONTAINER_NAME") ?? "BLOB_CONTAINER_NAME not found"; 23 | if (connectionString == "BLOB_STORAGE_CS not found" || containerName == "BLOB_CONTAINER_NAME not found" || string.IsNullOrEmpty(jsonFileName)) 24 | { 25 | Console.WriteLine("One or more environment variables are not set. Please set BLOB_STORAGE_CS, BLOB_CONTAINER_NAME and jsonFileName"); 26 | throw new Exception("One or more environment variables are not set. Please set BLOB_STORAGE_CS, BLOB_CONTAINER_NAME and jsonFileName"); 27 | } 28 | 29 | // Serialize the List to JSON with indentation for human readability 30 | var json = JsonSerializer.Serialize(functionCodePairs, new JsonSerializerOptions { WriteIndented = true }); 31 | 32 | 33 | // Create a BlobServiceClient to interact with Blob storage 34 | var blobServiceClient = new BlobServiceClient(connectionString); 35 | var blobContainerClient = blobServiceClient.GetBlobContainerClient(containerName); 36 | 37 | // Get a reference to the BlobClient 38 | var blobClient = blobContainerClient.GetBlobClient(jsonFileName); 39 | 40 | // Convert JSON data to a byte array 41 | byte[] byteArray = Encoding.UTF8.GetBytes(json); 42 | using var memoryStream = new MemoryStream(byteArray); 43 | 44 | // Upload the data to the blob 45 | await blobClient.UploadAsync(memoryStream, true); 46 | Console.WriteLine("Uploaded to Blob storage: " + jsonFileName); 47 | } 48 | public static async Task> LoadFunctionCodePairsFromAzureBlobAsync(string blobFileName, OpenAIClient openAIClient, string embeddingDeploymentName) 49 | { 50 | // Retrieve connection string and container name from environment variables 51 | string connectionString = Environment.GetEnvironmentVariable("BLOB_STORAGE_CS") ?? "BLOB_STORAGE_CS not found"; 52 | string containerName = Environment.GetEnvironmentVariable("BLOB_CONTAINER_NAME") ?? 
"BLOB_CONTAINER_NAME not found"; 53 | if ( openAIClient==null || connectionString == "BLOB_STORAGE_CS not found" 54 | || containerName == "BLOB_CONTAINER_NAME not found" || string.IsNullOrEmpty(blobFileName) 55 | || string.IsNullOrEmpty(embeddingDeploymentName)) 56 | { 57 | Console.WriteLine("One or more environment variables are not set, or the OpenAi client is null. Please set BLOB_STORAGE_CS, BLOB_CONTAINER_NAME and blobFileName"); 58 | throw new Exception("One or more environment variables are not set. Please set BLOB_STORAGE_CS, BLOB_CONTAINER_NAME and blobFileName"); 59 | } 60 | // Create a BlobServiceClient to interact with Blob storage 61 | var blobServiceClient = new BlobServiceClient(connectionString); 62 | var blobContainerClient = blobServiceClient.GetBlobContainerClient(containerName); 63 | var blobClient = blobContainerClient.GetBlobClient(blobFileName); 64 | 65 | // Check if the blob exists 66 | if (await blobClient.ExistsAsync()) 67 | { 68 | // Download the blob's content to a MemoryStream 69 | using var memoryStream = new MemoryStream(); 70 | await blobClient.DownloadToAsync(memoryStream); 71 | memoryStream.Position = 0; // Reset the memory stream position to the beginning 72 | 73 | // Create a StreamReader from the MemoryStream 74 | using var reader = new StreamReader(memoryStream); 75 | 76 | // Call the private method to process the data from the stream 77 | var functionCodePairs = await LoadFunctionCodePairsFromStreamAsync(reader, openAIClient, embeddingDeploymentName); 78 | 79 | return functionCodePairs; 80 | } 81 | else 82 | { 83 | throw new FileNotFoundException("Blob not found: " + blobFileName); 84 | } 85 | } 86 | 87 | public static async Task> LoadFunctionCodePairsFromCsvAsync(string csvFilePath, OpenAIClient openAIClient, string embeddingDeploymentName) 88 | { 89 | // Create a StreamReader from the file path 90 | using var fileStream = new FileStream(csvFilePath, FileMode.Open, FileAccess.Read); 91 | using var reader = new 
StreamReader(fileStream); 92 | 93 | // Call the private method to process the data from the stream 94 | var functionCodePairs = await LoadFunctionCodePairsFromStreamAsync(reader, openAIClient, embeddingDeploymentName); 95 | 96 | return functionCodePairs; 97 | } 98 | private static async Task> LoadFunctionCodePairsFromStreamAsync(StreamReader streamReader, OpenAIClient openAIClient, string embeddingDeploymentName) 99 | { 100 | var rows = new List(); 101 | var config = new CsvConfiguration(CultureInfo.InvariantCulture) { HasHeaderRecord = true }; 102 | EmbeddingsOptions embeddingsOptions; 103 | 104 | // Use the provided StreamReader 105 | using var csv = new CsvReader(streamReader, config); 106 | csv.Read(); 107 | csv.ReadHeader(); 108 | while (csv.Read()) 109 | { 110 | string function = csv.GetField("FunctionDescription") ?? string.Empty; 111 | string code = csv.GetField("FunctionCode") ?? string.Empty; 112 | var record = new FunctionCodePair(csv.GetField("id"), function, code); 113 | embeddingsOptions = new EmbeddingsOptions(embeddingDeploymentName, new List { record.Function }); 114 | var embeddingsResponse = await openAIClient.GetEmbeddingsAsync(embeddingsOptions); 115 | record.FunctionVector = embeddingsResponse.Value.Data[0].Embedding.ToArray(); 116 | rows.Add(record); 117 | } 118 | 119 | return rows; 120 | } 121 | 122 | public static async Task SaveVectorCollectionAsync(VectorCollection collection, string jsonFilePath) 123 | { 124 | await collection.SaveToDiskAsync(jsonFilePath); 125 | } 126 | 127 | public static async Task LoadVectorCollectionAsync(string jsonFilePath) 128 | { 129 | return await VectorCollection.CreateFromDiskAsync(jsonFilePath); 130 | } 131 | } 132 | } -------------------------------------------------------------------------------- /src/VectorLibrary/Utils/VectorMath.cs: -------------------------------------------------------------------------------- 1 | namespace VectorLibrary 2 | { 3 | public class VectorMath 4 | { 5 | private const int 
VectorDimension = 1536; 6 | 7 | public static float Length(float[] vector) 8 | { 9 | float sum = 0; 10 | for (int i = 0; i < VectorDimension ; i++) 11 | { 12 | sum += vector[i] * vector[i]; 13 | } 14 | return (float)Math.Sqrt(sum); 15 | } 16 | public static float DotProduct(float[] a, float[] b) 17 | { 18 | float sum = 0; 19 | for (int i = 0; i < VectorDimension; i++) 20 | { 21 | sum += a[i] * b[i]; 22 | } 23 | 24 | return sum; 25 | } 26 | public static float CosineSimilarity(float[] a, float[] b) 27 | { 28 | float dotProduct = DotProduct(a, b); 29 | return dotProduct / (float)Math.Pow(VectorDimension,2); 30 | } 31 | 32 | public static float EuclideanDistance(float[] a, float[] b) 33 | { 34 | float sum = 0; 35 | for (int i = 0; i < VectorDimension; i++) 36 | { 37 | sum += (a[i] - b[i]) * (a[i] - b[i]); 38 | } 39 | return (float)Math.Sqrt(sum); 40 | } 41 | } 42 | } -------------------------------------------------------------------------------- /tests/test.rest: -------------------------------------------------------------------------------- 1 | 2 | curl -X 'GET' \ 3 | 'http://localhost:8080/search/cosine?query=i%20am%20seeking%20to%20find%20my%20team%20commision%20data' \ 4 | -H 'accept: */*' --------------------------------------------------------------------------------