├── .github └── workflows │ ├── build_and_test_mlx.yml │ ├── build_and_test_vectura.yml │ └── update-readme.yml ├── .gitignore ├── .swiftpm └── xcode │ ├── package.xcworkspace │ └── contents.xcworkspacedata │ └── xcshareddata │ └── xcschemes │ ├── VecturaKit-Package.xcscheme │ ├── VecturaKit.xcscheme │ ├── VecturaKitTests.xcscheme │ ├── VecturaMLXKit.xcscheme │ ├── VecturaMLXKitTests.xcscheme │ ├── vectura-cli.xcscheme │ └── vectura-mlx-cli.xcscheme ├── .vscode ├── launch.json └── settings.json ├── LICENSE ├── Package.resolved ├── Package.swift ├── README.md ├── Sources ├── VecturaCLI │ └── VecturaCLI.swift ├── VecturaKit │ ├── BM25Index.swift │ ├── FileStorageProvider.swift │ ├── VecturaConfig.swift │ ├── VecturaDocument.swift │ ├── VecturaError.swift │ ├── VecturaKit.swift │ ├── VecturaModelSource.swift │ ├── VecturaProtocol.swift │ ├── VecturaSearchResult.swift │ └── VecturaStorage.swift ├── VecturaMLXCLI │ └── VecturaMLXCLI.swift └── VecturaMLXKit │ ├── MLXEmbedder.swift │ └── VecturaMLXKit.swift ├── Tests ├── VecturaKitTests │ └── VecturaKitTests.swift └── VecturaMLXKitTests │ └── VecturaMLXKitTests.swift └── scripts └── update_readme.py /.github/workflows/build_and_test_mlx.yml: -------------------------------------------------------------------------------- 1 | name: "VecturaMLX CI" 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | 9 | jobs: 10 | build-and-test: 11 | runs-on: macos-15 12 | env: 13 | DEVELOPER_DIR: "/Applications/Xcode_16.1.app/Contents/Developer" 14 | steps: 15 | - name: Checkout code 16 | uses: actions/checkout@v4 17 | - name: Setup Swift Toolchain 18 | uses: swift-actions/setup-swift@v2 19 | with: 20 | swift-version: "6.0" 21 | - name: Build target VecturaMLXKit 22 | run: xcodebuild -scheme "VecturaMLXKit" build -destination 'platform=macOS' 23 | - name: Run tests 24 | run: xcodebuild -scheme "VecturaMLXKitTests" test -destination 'platform=macOS' 
-------------------------------------------------------------------------------- /.github/workflows/build_and_test_vectura.yml: -------------------------------------------------------------------------------- 1 | name: "VecturaKit CI" 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | 9 | jobs: 10 | build-and-test: 11 | runs-on: macos-15 12 | env: 13 | DEVELOPER_DIR: "/Applications/Xcode_16.1.app/Contents/Developer" 14 | strategy: 15 | matrix: 16 | target: ["VecturaKit", "vectura-cli"] 17 | steps: 18 | - uses: actions/checkout@v4 19 | - name: Build ${{ matrix.target }} 20 | run: swift build --product "${{ matrix.target }}" 21 | - name: Test ${{ matrix.target }} 22 | run: swift test --filter "${{ matrix.target }}Tests" -------------------------------------------------------------------------------- /.github/workflows/update-readme.yml: -------------------------------------------------------------------------------- 1 | name: Update README 2 | 3 | on: 4 | push: 5 | # Run on pushes to main (or any branch you prefer) 6 | branches: [main] 7 | 8 | permissions: 9 | contents: write 10 | pull-requests: write 11 | 12 | jobs: 13 | update-readme: 14 | # Skip this job if the commit message indicates a merge from docs/update-readme. 15 | # Adjust the string in the contains() check if your merge commit message is different. 16 | if: "!contains(github.event.head_commit.message, 'docs/update-readme')" 17 | runs-on: ubuntu-latest 18 | 19 | steps: 20 | # 1. Check out the repository 21 | - name: Checkout repository 22 | uses: actions/checkout@v4 23 | 24 | # 2. Set up Python (make sure you choose a suitable version) 25 | - name: Set up Python 26 | uses: actions/setup-python@v5 27 | with: 28 | python-version: '3.x' 29 | 30 | # 3. Install Gemini package 31 | - name: Install Gemini package 32 | run: pip install google-genai 33 | 34 | # 4. 
Run the Python script that calls GeminiAI 35 | - name: Run README updater script 36 | env: 37 | GEMINI_API_KEY: "${{ secrets.GEMINI_API_KEY }}" 38 | run: python3 scripts/update_readme.py 39 | 40 | # 5. Open a Pull Request using an action (this one automates creating a PR) 41 | - name: Create Pull Request 42 | uses: peter-evans/create-pull-request@v4 43 | with: 44 | token: ${{ secrets.GITHUB_TOKEN }} 45 | commit-message: "docs: update README.md based on codebase" 46 | title: "docs: update README.md" 47 | body: | 48 | This PR updates the README.md file based on the current codebase using Gemini AI. 49 | 50 | - Automatically generated by GitHub Actions 51 | branch: docs/update-readme 52 | base: main 53 | delete-branch: true 54 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Xcode 2 | # 3 | # gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore 4 | 5 | ## User settings 6 | xcuserdata/ 7 | 8 | ## Obj-C/Swift specific 9 | *.hmap 10 | 11 | ## App packaging 12 | *.ipa 13 | *.dSYM.zip 14 | *.dSYM 15 | 16 | ## Playgrounds 17 | timeline.xctimeline 18 | playground.xcworkspace 19 | 20 | # Swift Package Manager 21 | # 22 | # Add this line if you want to avoid checking in source code from Swift Package Manager dependencies. 23 | # Packages/ 24 | # Package.pins 25 | # Package.resolved 26 | # *.xcodeproj 27 | # 28 | # Xcode automatically generates this directory with a .xcworkspacedata file and xcuserdata 29 | # hence it is not needed unless you have added a package configuration file to your project 30 | # .swiftpm 31 | 32 | .build/ 33 | 34 | # CocoaPods 35 | # 36 | # We recommend against adding the Pods directory to your .gitignore. 
However 37 | # you should judge for yourself, the pros and cons are mentioned at: 38 | # https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control 39 | # 40 | # Pods/ 41 | # 42 | # Add this line if you want to avoid checking in source code from the Xcode workspace 43 | # *.xcworkspace 44 | 45 | # Carthage 46 | # 47 | # Add this line if you want to avoid checking in source code from Carthage dependencies. 48 | # Carthage/Checkouts 49 | 50 | Carthage/Build/ 51 | 52 | # fastlane 53 | # 54 | # It is recommended to not store the screenshots in the git repo. 55 | # Instead, use fastlane to re-generate the screenshots whenever they are needed. 56 | # For more information about the recommended setup visit: 57 | # https://docs.fastlane.tools/best-practices/source-control/#source-control 58 | 59 | fastlane/report.xml 60 | fastlane/Preview.html 61 | fastlane/screenshots/**/*.png 62 | fastlane/test_output 63 | -------------------------------------------------------------------------------- /.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata: -------------------------------------------------------------------------------- 1 | 2 | 4 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.swiftpm/xcode/xcshareddata/xcschemes/VecturaKit-Package.xcscheme: -------------------------------------------------------------------------------- 1 | 2 | 5 | 9 | 10 | 16 | 22 | 23 | 24 | 30 | 36 | 37 | 38 | 44 | 50 | 51 | 52 | 58 | 64 | 65 | 66 | 67 | 68 | 74 | 75 | 77 | 83 | 84 | 85 | 87 | 93 | 94 | 95 | 96 | 97 | 107 | 108 | 114 | 115 | 116 | 117 | 123 | 124 | 130 | 131 | 132 | 133 | 135 | 136 | 139 | 140 | 141 | -------------------------------------------------------------------------------- /.swiftpm/xcode/xcshareddata/xcschemes/VecturaKit.xcscheme: -------------------------------------------------------------------------------- 1 | 2 | 5 | 9 | 10 | 16 | 22 | 23 | 24 | 25 | 26 | 32 | 33 | 
35 | 41 | 42 | 43 | 44 | 45 | 55 | 56 | 62 | 63 | 69 | 70 | 71 | 72 | 74 | 75 | 78 | 79 | 80 | -------------------------------------------------------------------------------- /.swiftpm/xcode/xcshareddata/xcschemes/VecturaKitTests.xcscheme: -------------------------------------------------------------------------------- 1 | 2 | 5 | 9 | 10 | 16 | 17 | 19 | 25 | 26 | 27 | 28 | 29 | 39 | 40 | 46 | 47 | 49 | 50 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /.swiftpm/xcode/xcshareddata/xcschemes/VecturaMLXKit.xcscheme: -------------------------------------------------------------------------------- 1 | 2 | 5 | 9 | 10 | 16 | 22 | 23 | 24 | 25 | 26 | 32 | 33 | 43 | 44 | 50 | 51 | 57 | 58 | 59 | 60 | 62 | 63 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /.swiftpm/xcode/xcshareddata/xcschemes/VecturaMLXKitTests.xcscheme: -------------------------------------------------------------------------------- 1 | 2 | 5 | 9 | 10 | 16 | 17 | 19 | 25 | 26 | 27 | 28 | 29 | 39 | 40 | 46 | 47 | 49 | 50 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /.swiftpm/xcode/xcshareddata/xcschemes/vectura-cli.xcscheme: -------------------------------------------------------------------------------- 1 | 2 | 5 | 9 | 10 | 16 | 22 | 23 | 24 | 25 | 26 | 32 | 33 | 43 | 45 | 51 | 52 | 53 | 54 | 60 | 62 | 68 | 69 | 70 | 71 | 73 | 74 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /.swiftpm/xcode/xcshareddata/xcschemes/vectura-mlx-cli.xcscheme: -------------------------------------------------------------------------------- 1 | 2 | 5 | 9 | 10 | 16 | 22 | 23 | 24 | 25 | 26 | 32 | 33 | 43 | 45 | 51 | 52 | 53 | 54 | 57 | 58 | 59 | 60 | 66 | 68 | 74 | 75 | 76 | 77 | 79 | 80 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /.vscode/launch.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "configurations": [ 3 | { 4 | "type": "swift", 5 | "request": "launch", 6 | "args": [], 7 | "cwd": "${workspaceFolder:VecturaKit}", 8 | "name": "Debug vectura-cli", 9 | "program": "${workspaceFolder:VecturaKit}/.build/debug/vectura-cli", 10 | "preLaunchTask": "swift: Build Debug vectura-cli" 11 | }, 12 | { 13 | "type": "swift", 14 | "request": "launch", 15 | "args": [], 16 | "cwd": "${workspaceFolder:VecturaKit}", 17 | "name": "Release vectura-cli", 18 | "program": "${workspaceFolder:VecturaKit}/.build/release/vectura-cli", 19 | "preLaunchTask": "swift: Build Release vectura-cli" 20 | }, 21 | { 22 | "type": "swift", 23 | "request": "launch", 24 | "args": [], 25 | "cwd": "${workspaceFolder:VecturaKit}", 26 | "name": "Debug vectura-mlx-cli", 27 | "program": "${workspaceFolder:VecturaKit}/.build/debug/vectura-mlx-cli", 28 | "preLaunchTask": "swift: Build Debug vectura-mlx-cli" 29 | }, 30 | { 31 | "type": "swift", 32 | "request": "launch", 33 | "args": [], 34 | "cwd": "${workspaceFolder:VecturaKit}", 35 | "name": "Release vectura-mlx-cli", 36 | "program": "${workspaceFolder:VecturaKit}/.build/release/vectura-mlx-cli", 37 | "preLaunchTask": "swift: Build Release vectura-mlx-cli" 38 | } 39 | ] 40 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Rudrank Riyam 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, 
modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Package.resolved: -------------------------------------------------------------------------------- 1 | { 2 | "originHash" : "6db3b74627697ac586e400fabd67629791185e08ed51fb68cf82ab175e2330c2", 3 | "pins" : [ 4 | { 5 | "identity" : "gzipswift", 6 | "kind" : "remoteSourceControl", 7 | "location" : "https://github.com/1024jp/GzipSwift", 8 | "state" : { 9 | "revision" : "731037f6cc2be2ec01562f6597c1d0aa3fe6fd05", 10 | "version" : "6.0.1" 11 | } 12 | }, 13 | { 14 | "identity" : "jinja", 15 | "kind" : "remoteSourceControl", 16 | "location" : "https://github.com/johnmai-dev/Jinja", 17 | "state" : { 18 | "revision" : "bbddb92fc51ae420b87300298370fd1dfc308f73", 19 | "version" : "1.1.1" 20 | } 21 | }, 22 | { 23 | "identity" : "mlx-swift", 24 | "kind" : "remoteSourceControl", 25 | "location" : "https://github.com/ml-explore/mlx-swift", 26 | "state" : { 27 | "revision" : "70dbb62128a5a1471a5ab80363430adb33470cab", 28 | "version" : "0.21.2" 29 | } 30 | }, 31 | { 32 | "identity" : "mlx-swift-examples", 33 | "kind" : "remoteSourceControl", 34 | "location" : 
"https://github.com/ml-explore/mlx-swift-examples/", 35 | "state" : { 36 | "branch" : "main", 37 | "revision" : "cb66b4bc6bc1a69663837881e7f1260cd49d6b59" 38 | } 39 | }, 40 | { 41 | "identity" : "swift-argument-parser", 42 | "kind" : "remoteSourceControl", 43 | "location" : "https://github.com/apple/swift-argument-parser.git", 44 | "state" : { 45 | "revision" : "0fbc8848e389af3bb55c182bc19ca9d5dc2f255b", 46 | "version" : "1.4.0" 47 | } 48 | }, 49 | { 50 | "identity" : "swift-collections", 51 | "kind" : "remoteSourceControl", 52 | "location" : "https://github.com/apple/swift-collections.git", 53 | "state" : { 54 | "revision" : "671108c96644956dddcd89dd59c203dcdb36cec7", 55 | "version" : "1.1.4" 56 | } 57 | }, 58 | { 59 | "identity" : "swift-embeddings", 60 | "kind" : "remoteSourceControl", 61 | "location" : "https://github.com/jkrukowski/swift-embeddings.git", 62 | "state" : { 63 | "revision" : "419c52ea50238435218c587e3bebfe290ee91287", 64 | "version" : "0.0.13" 65 | } 66 | }, 67 | { 68 | "identity" : "swift-numerics", 69 | "kind" : "remoteSourceControl", 70 | "location" : "https://github.com/apple/swift-numerics", 71 | "state" : { 72 | "revision" : "0a5bc04095a675662cf24757cc0640aa2204253b", 73 | "version" : "1.0.2" 74 | } 75 | }, 76 | { 77 | "identity" : "swift-safetensors", 78 | "kind" : "remoteSourceControl", 79 | "location" : "https://github.com/jkrukowski/swift-safetensors.git", 80 | "state" : { 81 | "revision" : "718b0f38f912e0bf9d92130fa1e1fe2ae5136dd6", 82 | "version" : "0.0.7" 83 | } 84 | }, 85 | { 86 | "identity" : "swift-sentencepiece", 87 | "kind" : "remoteSourceControl", 88 | "location" : "https://github.com/jkrukowski/swift-sentencepiece", 89 | "state" : { 90 | "revision" : "36a8b2b45733f6adb3092100f16e4c7d38a10a7c", 91 | "version" : "0.0.6" 92 | } 93 | }, 94 | { 95 | "identity" : "swift-transformers", 96 | "kind" : "remoteSourceControl", 97 | "location" : "https://github.com/huggingface/swift-transformers", 98 | "state" : { 99 | "revision" : 
"be855fac725dbae27264e47a3eb535cc422a4ba8", 100 | "version" : "0.1.18" 101 | } 102 | } 103 | ], 104 | "version" : 3 105 | } 106 | -------------------------------------------------------------------------------- /Package.swift: -------------------------------------------------------------------------------- 1 | // swift-tools-version: 6.0 2 | // The swift-tools-version declares the minimum version of Swift required to build this package. 3 | 4 | import PackageDescription 5 | 6 | let package = Package( 7 | name: "VecturaKit", 8 | platforms: [ 9 | .macOS(.v14), 10 | .iOS(.v17), 11 | .tvOS(.v17), 12 | .visionOS(.v1), 13 | .watchOS(.v10), 14 | ], 15 | products: [ 16 | .library( 17 | name: "VecturaKit", 18 | targets: ["VecturaKit"] 19 | ), 20 | .library( 21 | name: "VecturaMLXKit", 22 | targets: ["VecturaMLXKit"] 23 | ), 24 | .executable( 25 | name: "vectura-cli", 26 | targets: ["VecturaCLI"] 27 | ), 28 | .executable( 29 | name: "vectura-mlx-cli", 30 | targets: ["VecturaMLXCLI"] 31 | ), 32 | ], 33 | dependencies: [ 34 | .package(url: "https://github.com/jkrukowski/swift-embeddings.git", from: "0.0.10"), 35 | .package(url: "https://github.com/apple/swift-argument-parser.git", from: "1.4.0"), 36 | .package(url: "https://github.com/ml-explore/mlx-swift-examples/", branch: "main"), 37 | ], 38 | targets: [ 39 | .target( 40 | name: "VecturaKit", 41 | dependencies: [ 42 | .product(name: "Embeddings", package: "swift-embeddings") 43 | ], 44 | cSettings: [ 45 | .define("ACCELERATE_NEW_LAPACK"), 46 | .define("ACCELERATE_LAPACK_ILP64"), 47 | ] 48 | ), 49 | .target( 50 | name: "VecturaMLXKit", 51 | dependencies: [ 52 | "VecturaKit", 53 | .product(name: "MLXEmbedders", package: "mlx-swift-examples"), 54 | ] 55 | ), 56 | .executableTarget( 57 | name: "VecturaCLI", 58 | dependencies: [ 59 | "VecturaKit", 60 | .product(name: "ArgumentParser", package: "swift-argument-parser"), 61 | ] 62 | ), 63 | .executableTarget( 64 | name: "VecturaMLXCLI", 65 | dependencies: [ 66 | "VecturaMLXKit", 
67 | .product(name: "ArgumentParser", package: "swift-argument-parser"), 68 | ] 69 | ), 70 | .testTarget( 71 | name: "VecturaKitTests", 72 | dependencies: ["VecturaKit"] 73 | ), 74 | .testTarget( 75 | name: "VecturaMLXKitTests", 76 | dependencies: ["VecturaMLXKit"] 77 | ), 78 | ] 79 | ) 80 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VecturaKit 2 | 3 | VecturaKit is a Swift-based vector database designed for on-device applications, enabling advanced user experiences through local vector storage and retrieval. Inspired by [Dripfarm's SVDB](https://github.com/Dripfarm/SVDB), **VecturaKit** utilizes `MLTensor` and [`swift-embeddings`](https://github.com/jkrukowski/swift-embeddings) for generating and managing embeddings. The framework offers two primary modules: `VecturaKit`, which supports diverse embedding models via `swift-embeddings`, and `VecturaMLXKit`, which leverages Apple's MLX framework for accelerated processing. 4 | 5 | ## Support 6 | 7 | Love this project? Check out my books to explore more of AI and iOS development: 8 | - [Exploring AI for iOS Development](https://academy.rudrank.com/product/ai) 9 | - [Exploring AI-Assisted Coding for iOS Development](https://academy.rudrank.com/product/ai-assisted-coding) 10 | 11 | Your support helps to keep this project growing! 12 | 13 | ## Key Features 14 | 15 | - **On-Device Storage:** Stores and manages vector embeddings locally, enhancing privacy and reducing latency. 16 | - **Hybrid Search:** Combines vector similarity with BM25 text search for comprehensive and relevant search results (`VecturaKit`). 17 | - **Batch Processing:** Indexes documents in parallel for faster data ingestion. 18 | - **Persistent Storage:** Automatically saves and loads document data, preserving the database state across app sessions. 
19 | - **Configurable Search:** Customizes search behavior with adjustable thresholds, result limits, and hybrid search weights. 20 | - **Custom Storage Location:** Specifies a custom directory for database storage. 21 | - **MLX Support:** Employs Apple's MLX framework for accelerated embedding generation and search operations (`VecturaMLXKit`). 22 | - **CLI Tool:** Includes a command-line interface (CLI) for database management, testing, and debugging for both `VecturaKit` and `VecturaMLXKit`. 23 | 24 | ## Supported Platforms 25 | 26 | - macOS 14.0 or later 27 | - iOS 17.0 or later 28 | - tvOS 17.0 or later 29 | - visionOS 1.0 or later 30 | - watchOS 10.0 or later 31 | 32 | ## Installation 33 | 34 | ### Swift Package Manager 35 | 36 | To integrate VecturaKit into your project using Swift Package Manager, add the following dependency in your `Package.swift` file: 37 | 38 | ```swift 39 | dependencies: [ 40 | .package(url: "https://github.com/rryam/VecturaKit.git", branch: "main"), 41 | ], 42 | ``` 43 | 44 | ### Dependencies 45 | 46 | VecturaKit relies on the following Swift packages: 47 | 48 | - [swift-embeddings](https://github.com/jkrukowski/swift-embeddings): Used in `VecturaKit` for generating text embeddings using various models. 49 | - [swift-argument-parser](https://github.com/apple/swift-argument-parser): Used for creating the command-line interface. 50 | - [mlx-swift-examples](https://github.com/ml-explore/mlx-swift-examples): Provides MLX-based embeddings and vector search capabilities, specifically for `VecturaMLXKit`. 51 | 52 | ## Usage 53 | 54 | ### Core VecturaKit 55 | 56 | 1. **Import VecturaKit** 57 | 58 | ```swift 59 | import VecturaKit 60 | ``` 61 | 62 | 2. 
**Create Configuration and Initialize Database** 63 | 64 | ```swift 65 | import Foundation 66 | import VecturaKit 67 | 68 | let config = VecturaConfig( 69 | name: "my-vector-db", 70 | directoryURL: nil, // Optional custom storage location 71 | dimension: 384, // Matches the default BERT model dimension 72 | searchOptions: VecturaConfig.SearchOptions( 73 | defaultNumResults: 10, 74 | minThreshold: 0.7, 75 | hybridWeight: 0.5, // Balance between vector and text search 76 | k1: 1.2, // BM25 parameters 77 | b: 0.75 78 | ) 79 | ) 80 | 81 | let vectorDB = try await VecturaKit(config: config) 82 | ``` 83 | 84 | 3. **Add Documents** 85 | 86 | Single document: 87 | 88 | ```swift 89 | let text = "Sample text to be embedded" 90 | let documentId = try await vectorDB.addDocument( 91 | text: text, 92 | id: UUID(), // Optional, will be generated if not provided 93 | model: .id("sentence-transformers/all-MiniLM-L6-v2") // Optional, this is the default 94 | ) 95 | ``` 96 | 97 | Multiple documents in batch: 98 | 99 | ```swift 100 | let texts = [ 101 | "First document text", 102 | "Second document text", 103 | "Third document text" 104 | ] 105 | let documentIds = try await vectorDB.addDocuments( 106 | texts: texts, 107 | ids: nil, // Optional array of UUIDs 108 | model: .id("sentence-transformers/all-MiniLM-L6-v2") // Optional model 109 | ) 110 | ``` 111 | 112 | 4. 
**Search Documents** 113 | 114 | Search by text (hybrid search): 115 | 116 | ```swift 117 | let results = try await vectorDB.search( 118 | query: "search query", 119 | numResults: 5, // Optional 120 | threshold: 0.8, // Optional 121 | model: .id("sentence-transformers/all-MiniLM-L6-v2") // Optional 122 | ) 123 | 124 | for result in results { 125 | print("Document ID: \(result.id)") 126 | print("Text: \(result.text)") 127 | print("Similarity Score: \(result.score)") 128 | print("Created At: \(result.createdAt)") 129 | } 130 | ``` 131 | 132 | Search by vector embedding: 133 | 134 | ```swift 135 | let results = try await vectorDB.search( 136 | query: embeddingArray, // [Float] matching config.dimension 137 | numResults: 5, // Optional 138 | threshold: 0.8 // Optional 139 | ) 140 | ``` 141 | 142 | 5. **Document Management** 143 | 144 | Update document: 145 | 146 | ```swift 147 | try await vectorDB.updateDocument( 148 | id: documentId, 149 | newText: "Updated text", 150 | model: .id("sentence-transformers/all-MiniLM-L6-v2") // Optional 151 | ) 152 | ``` 153 | 154 | Delete documents: 155 | 156 | ```swift 157 | try await vectorDB.deleteDocuments(ids: [documentId1, documentId2]) 158 | ``` 159 | 160 | Reset database: 161 | 162 | ```swift 163 | try await vectorDB.reset() 164 | ``` 165 | 166 | ### VecturaMLXKit (MLX Version) 167 | 168 | VecturaMLXKit harnesses Apple's MLX framework for accelerated processing, delivering optimized performance for on-device machine learning tasks. 169 | 170 | 1. **Import VecturaMLXKit** 171 | 172 | ```swift 173 | import VecturaMLXKit 174 | ``` 175 | 176 | 2. **Initialize Database** 177 | 178 | ```swift 179 | import VecturaMLXKit 180 | import MLXEmbedders 181 | 182 | let config = VecturaConfig( 183 | name: "my-mlx-vector-db", 184 | dimension: 768 // nomic_text_v1_5 model outputs 768-dimensional embeddings 185 | ) 186 | let vectorDB = try await VecturaMLXKit(config: config, modelConfiguration: .nomic_text_v1_5) 187 | ``` 188 | 189 | 3. 
**Add Documents** 190 | 191 | ```swift 192 | let texts = [ 193 | "First document text", 194 | "Second document text", 195 | "Third document text" 196 | ] 197 | let documentIds = try await vectorDB.addDocuments(texts: texts) 198 | ``` 199 | 200 | 4. **Search Documents** 201 | 202 | ```swift 203 | let results = try await vectorDB.search( 204 | query: "search query", 205 | numResults: 5, // Optional 206 | threshold: 0.8 // Optional 207 | ) 208 | 209 | for result in results { 210 | print("Document ID: \(result.id)") 211 | print("Text: \(result.text)") 212 | print("Similarity Score: \(result.score)") 213 | print("Created At: \(result.createdAt)") 214 | } 215 | ``` 216 | 217 | 5. **Document Management** 218 | 219 | Update document: 220 | 221 | ```swift 222 | try await vectorDB.updateDocument( 223 | id: documentId, 224 | newText: "Updated text" 225 | ) 226 | ``` 227 | 228 | Delete documents: 229 | 230 | ```swift 231 | try await vectorDB.deleteDocuments(ids: [documentId1, documentId2]) 232 | ``` 233 | 234 | Reset database: 235 | 236 | ```swift 237 | try await vectorDB.reset() 238 | ``` 239 | 240 | ## Command Line Interface 241 | 242 | VecturaKit includes a command-line interface for both the standard and MLX versions, facilitating easy database management. 
243 | 244 | **Standard CLI Tool** 245 | 246 | ```bash 247 | # Add documents 248 | vectura add "First document" "Second document" "Third document" \ 249 | --db-name "my-vector-db" \ 250 | --dimension 384 \ 251 | --model-id "sentence-transformers/all-MiniLM-L6-v2" 252 | 253 | # Search documents 254 | vectura search "search query" \ 255 | --db-name "my-vector-db" \ 256 | --dimension 384 \ 257 | --threshold 0.7 \ 258 | --num-results 5 \ 259 | --model-id "sentence-transformers/all-MiniLM-L6-v2" 260 | 261 | # Update document 262 | vectura update "Updated text content" \ 263 | --db-name "my-vector-db" \ 264 | --dimension 384 \ 265 | --model-id "sentence-transformers/all-MiniLM-L6-v2" 266 | 267 | # Delete documents 268 | vectura delete \ 269 | --db-name "my-vector-db" \ 270 | --dimension 384 271 | 272 | # Reset database 273 | vectura reset \ 274 | --db-name "my-vector-db" \ 275 | --dimension 384 276 | 277 | # Run demo with sample data 278 | vectura mock \ 279 | --db-name "my-vector-db" \ 280 | --dimension 384 \ 281 | --threshold 0.7 \ 282 | --num-results 10 \ 283 | --model-id "sentence-transformers/all-MiniLM-L6-v2" 284 | ``` 285 | 286 | Common options: 287 | 288 | - `--db-name, -d`: Database name (default: "vectura-cli-db") 289 | - `--dimension, -v`: Vector dimension (default: 384) 290 | - `--threshold, -t`: Minimum similarity threshold (default: 0.7) 291 | - `--num-results, -n`: Number of results to return (default: 10) 292 | - `--model-id, -m`: Model ID for embeddings (default: "sentence-transformers/all-MiniLM-L6-v2") 293 | 294 | **MLX CLI Tool** 295 | 296 | ```bash 297 | # Add documents 298 | vectura-mlx add "First document" "Second document" "Third document" --db-name "my-mlx-vector-db" 299 | 300 | # Search documents 301 | vectura-mlx search "search query" --db-name "my-mlx-vector-db" --threshold 0.7 --num-results 5 302 | 303 | # Update document 304 | vectura-mlx update "Updated text content" --db-name "my-mlx-vector-db" 305 | 306 | # Delete documents 307 | 
vectura-mlx delete --db-name "my-mlx-vector-db" 308 | 309 | # Reset database 310 | vectura-mlx reset --db-name "my-mlx-vector-db" 311 | 312 | # Run demo with sample data 313 | vectura-mlx mock --db-name "my-mlx-vector-db" 314 | ``` 315 | 316 | ## License 317 | 318 | VecturaKit is released under the MIT License. See the [LICENSE](LICENSE) file for more information. Copyright (c) 2025 Rudrank Riyam. 319 | 320 | ## Contributing 321 | 322 | Contributions are welcome! Please fork the repository and submit a pull request with your improvements. 323 | 324 | ### Development 325 | 326 | The project is structured as a Swift Package. It includes the following key targets: 327 | 328 | - `VecturaKit`: The core vector database library. 329 | - `VecturaMLXKit`: The MLX-accelerated version of the library. 330 | - `vectura-cli`: The command-line interface for `VecturaKit`. 331 | - `vectura-mlx-cli`: The command-line interface for `VecturaMLXKit`. 332 | 333 | To build and test the project, use the following commands: 334 | 335 | ```bash 336 | swift build 337 | swift test 338 | ``` 339 | 340 | The project also includes CI workflows defined in `.github/workflows` to automate building and testing on pull requests and pushes to the `main` branch. The workflows require Xcode 16.1 and Swift 6.0. 341 | 342 | Debug and Release launch configurations are provided in `.vscode/launch.json` for both `vectura-cli` and `vectura-mlx-cli`. These can be used to build and launch either CLI with the debugger attached. 343 | 344 | ### Continuous Integration 345 | 346 | The project uses GitHub Actions for continuous integration. The following workflows are defined: 347 | 348 | - `.github/workflows/build_and_test_mlx.yml`: Builds and tests the `VecturaMLXKit` target. 349 | - `.github/workflows/build_and_test_vectura.yml`: Builds and tests the `VecturaKit` and `vectura-cli` targets. 350 | - `.github/workflows/update-readme.yml`: Automatically updates the `README.md` file using a Python script that calls the Gemini AI model. 
This workflow is triggered on pushes to the `main` branch and creates a pull request with the updated README. 351 | -------------------------------------------------------------------------------- /Sources/VecturaCLI/VecturaCLI.swift: --------------------------------------------------------------------------------
import ArgumentParser
import Foundation
import VecturaKit

/// Command-line front end for a VecturaKit vector database.
///
/// Each subcommand opens (or creates) the database named by `--db-name`, performs one
/// operation and prints a human-readable summary to standard output.
@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, watchOS 11.0, *)
@main
struct VecturaCLI: AsyncParsableCommand {
  /// A document identifier that can be parsed straight from a command-line argument.
  struct DocumentID: ExpressibleByArgument, Decodable {
    let uuid: UUID

    init(_ uuid: UUID) {
      self.uuid = uuid
    }

    /// Fails (returns `nil`) when `argument` is not a valid UUID string, which makes
    /// ArgumentParser report a validation error to the user.
    init?(argument: String) {
      guard let uuid = UUID(uuidString: argument) else { return nil }
      self.uuid = uuid
    }
  }

  /// Default option values shared by every subcommand, kept in one place so the
  /// subcommands cannot drift apart.
  enum CLIDefaults {
    static let dbName = "vectura-cli-db"
    static let dimension = 384
    static let threshold: Float = 0.7
    static let numResults = 10
    static let modelId = "sentence-transformers/all-MiniLM-L6-v2"
  }

  static let configuration = CommandConfiguration(
    commandName: "vectura",
    abstract: "A CLI tool for VecturaKit vector database",
    subcommands: [Add.self, Search.self, Update.self, Delete.self, Reset.self, Mock.self]
  )

  /// Opens the database described by the given options.
  ///
  /// - Parameters:
  ///   - dbName: Name of the database instance.
  ///   - dimension: Dimension of the stored vectors.
  ///   - numResults: Default number of search results (defaults to ``CLIDefaults/numResults``).
  ///   - threshold: Minimum similarity threshold (defaults to ``CLIDefaults/threshold``).
  /// - Returns: A ready-to-use `VecturaKit` instance.
  /// - Throws: Whatever `VecturaKit.init(config:)` throws.
  static func setupDB(
    dbName: String,
    dimension: Int,
    numResults: Int = CLIDefaults.numResults,
    threshold: Float = CLIDefaults.threshold
  ) async throws -> VecturaKit {
    let config = VecturaConfig(
      name: dbName,
      dimension: dimension,
      searchOptions: VecturaConfig.SearchOptions(
        defaultNumResults: numResults,
        minThreshold: threshold
      )
    )
    return try await VecturaKit(config: config)
  }

  /// Prints the IDs and texts of freshly added documents, pairing them positionally.
  static func printAdded(ids: [UUID], texts: [String]) {
    print("Added \(ids.count) documents:")
    for (id, content) in zip(ids, texts) {
      print("ID: \(id)")
      print("Text: \(content)")
      print("---")
    }
  }

  /// Prints a list of search results in the CLI's standard format.
  static func printResults(_ results: [VecturaSearchResult]) {
    print("Found \(results.count) results:")
    for result in results {
      print("ID: \(result.id)")
      print("Text: \(result.text)")
      print("Score: \(result.score)")
      print("Created: \(result.createdAt)")
      print("---")
    }
  }
}

@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, watchOS 11.0, *)
extension VecturaCLI {
  /// Resets the database, then adds, searches, updates and deletes sample documents.
  struct Mock: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
      abstract: "Run a mock demonstration with sample data"
    )

    @Option(name: [.long, .customShort("d")], help: "Database name")
    var dbName: String = CLIDefaults.dbName

    @Option(name: [.long, .customShort("v")], help: "Vector dimension")
    var dimension: Int = CLIDefaults.dimension

    @Option(name: [.long, .customShort("t")], help: "Minimum similarity threshold")
    var threshold: Float = CLIDefaults.threshold

    @Option(name: [.long, .customShort("n")], help: "Number of results to return")
    var numResults: Int = CLIDefaults.numResults

    @Option(name: [.long, .customShort("m")], help: "Model ID for embeddings")
    var modelId: String = CLIDefaults.modelId

    mutating func run() async throws {
      let db = try await VecturaCLI.setupDB(
        dbName: dbName,
        dimension: dimension,
        numResults: numResults,
        threshold: threshold
      )

      // Start from a clean slate so the demo output is reproducible.
      print("\n🧹 Resetting database...")
      try await db.reset()

      print("\n📝 Adding sample documents...")
      let sampleTexts = [
        "The quick brown fox jumps over the lazy dog",
        "To be or not to be, that is the question",
        "All that glitters is not gold",
        "A journey of a thousand miles begins with a single step",
        "Where there's smoke, there's fire",
      ]

      let ids = try await db.addDocuments(texts: sampleTexts, modelId: modelId)
      VecturaCLI.printAdded(ids: ids, texts: sampleTexts)

      print("\n🔍 Searching for 'journey'...")
      let results = try await db.search(
        query: "journey",
        numResults: numResults,
        threshold: threshold,
        modelId: modelId
      )
      VecturaCLI.printResults(results)

      if let firstId = ids.first {
        print("\n✏️ Updating first document...")
        let newText = "The quick red fox jumps over the sleeping dog"
        try await db.updateDocument(id: firstId, newText: newText, modelId: modelId)
        print("Updated document \(firstId) with new text: \(newText)")
      }

      if let lastId = ids.last {
        print("\n🗑️ Deleting last document...")
        try await db.deleteDocuments(ids: [lastId])
        print("Deleted document \(lastId)")
      }

      print("\n✨ Mock demonstration completed!")
    }
  }

  /// Embeds one or more texts and stores them as documents.
  struct Add: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
      abstract: "Add documents to the vector database"
    )

    @Option(name: [.long, .customShort("d")], help: "Database name")
    var dbName: String = CLIDefaults.dbName

    @Option(name: [.long, .customShort("v")], help: "Vector dimension")
    var dimension: Int = CLIDefaults.dimension

    @Option(name: [.long, .customShort("m")], help: "Model ID for embeddings")
    var modelId: String = CLIDefaults.modelId

    @Argument(help: "Text content to add")
    var text: [String]

    mutating func run() async throws {
      let db = try await VecturaCLI.setupDB(dbName: dbName, dimension: dimension)
      let ids = try await db.addDocuments(texts: text, modelId: modelId)
      VecturaCLI.printAdded(ids: ids, texts: text)
    }
  }

  /// Runs a text query against the database and prints the matches.
  struct Search: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
      abstract: "Search documents in the vector database"
    )

    @Option(name: [.long, .customShort("d")], help: "Database name")
    var dbName: String = CLIDefaults.dbName

    @Option(name: [.long, .customShort("v")], help: "Vector dimension")
    var dimension: Int = CLIDefaults.dimension

    @Option(name: [.long, .customShort("t")], help: "Minimum similarity threshold")
    var threshold: Float = CLIDefaults.threshold

    @Option(name: [.long, .customShort("n")], help: "Number of results to return")
    var numResults: Int = CLIDefaults.numResults

    @Option(name: [.long, .customShort("m")], help: "Model ID for embeddings")
    var modelId: String = CLIDefaults.modelId

    @Argument(help: "Search query")
    var query: String

    mutating func run() async throws {
      let db = try await VecturaCLI.setupDB(
        dbName: dbName,
        dimension: dimension,
        numResults: numResults,
        threshold: threshold
      )
      let results = try await db.search(
        query: query,
        numResults: numResults,
        threshold: threshold,
        modelId: modelId
      )
      VecturaCLI.printResults(results)
    }
  }

  /// Replaces the text (and therefore the embedding) of one existing document.
  struct Update: AsyncParsableCommand, Decodable {
    static let configuration = CommandConfiguration(
      abstract: "Update a document in the vector database"
    )

    @Option(name: [.long, .customShort("d")], help: "Database name")
    var dbName: String = CLIDefaults.dbName

    @Option(name: [.long, .customShort("v")], help: "Vector dimension")
    var dimension: Int = CLIDefaults.dimension

    @Option(name: [.long, .customShort("m")], help: "Model ID for embeddings")
    var modelId: String = CLIDefaults.modelId

    @Argument(help: "Document ID to update")
    var id: DocumentID

    @Argument(help: "New text content")
    var newText: String

    mutating func run() async throws {
      let db = try await VecturaCLI.setupDB(dbName: dbName, dimension: dimension)
      try await db.updateDocument(id: id.uuid, newText: newText, modelId: modelId)
      print("Updated document \(id.uuid) with new text: \(newText)")
    }
  }

  /// Deletes the documents with the given IDs.
  struct Delete: AsyncParsableCommand, Decodable {
    static let configuration = CommandConfiguration(
      abstract: "Delete documents from the vector database"
    )

    @Option(name: [.long, .customShort("d")], help: "Database name")
    var dbName: String = CLIDefaults.dbName

    @Option(name: [.long, .customShort("v")], help: "Vector dimension")
    var dimension: Int = CLIDefaults.dimension

    @Argument(help: "Document IDs to delete")
    var ids: [DocumentID]

    mutating func run() async throws {
      let db = try await VecturaCLI.setupDB(dbName: dbName, dimension: dimension)
      try await db.deleteDocuments(ids: ids.map(\.uuid))
      print("Deleted \(ids.count) documents")
    }
  }

  /// Removes every document from the database.
  struct Reset: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
      abstract: "Reset the vector database"
    )

    @Option(name: [.long, .customShort("d")], help: "Database name")
    var dbName: String = CLIDefaults.dbName

    @Option(name: [.long, .customShort("v")], help: "Vector dimension")
    var dimension: Int = CLIDefaults.dimension

    mutating func run() async throws {
      let db = try await VecturaCLI.setupDB(dbName: dbName, dimension: dimension)
      try await db.reset()
      print("Database reset successfully")
    }
  }
}

-------------------------------------------------------------------------------- /Sources/VecturaKit/BM25Index.swift: -------------------------------------------------------------------------------- 1 | // 2 | // BM25Index.swift 3 | // VecturaKit 4 | // 5 | // Created by Rudrank Riyam on 1/19/25.
//

import Foundation

/// Splits `text` into lowercase, diacritic-folded tokens, breaking on every
/// non-alphanumeric character and dropping empty fragments.
private func tokenize(_ text: String) -> [String] {
  text.lowercased()
    .folding(options: .diacriticInsensitive, locale: .current)
    .components(separatedBy: CharacterSet.alphanumerics.inverted)
    .filter { !$0.isEmpty }
}

/// An index for BM25-based text search over VecturaDocuments.
///
/// Every document is tokenized exactly once, when it enters the index; searches only
/// consult the cached per-document term counts.
public struct BM25Index {
  private let k1: Float
  private let b: Float

  /// Indexed documents, in insertion order.
  private var documents: [VecturaDocument]

  /// Term counts per document, parallel to `documents` (same index).
  private var termCounts: [[String: Int]]

  /// Number of documents in which each term occurs at least once.
  private var documentFrequencies: [String: Int]

  /// Token count per document ID.
  private var documentLengths: [UUID: Int]

  /// Sum of the values in `documentLengths`, maintained incrementally.
  private var totalDocumentLength: Int

  /// Mean token count per document; `0` while the index is empty.
  private var averageDocumentLength: Float

  /// Creates a new BM25 index for the given documents
  ///
  /// - Parameters:
  ///   - documents: The documents to index
  ///   - k1: BM25 k1 parameter (default: 1.2)
  ///   - b: BM25 b parameter (default: 0.75)
  public init(documents: [VecturaDocument], k1: Float = 1.2, b: Float = 0.75) {
    self.k1 = k1
    self.b = b
    self.documents = []
    self.termCounts = []
    self.documentFrequencies = [:]
    self.documentLengths = [:]
    self.totalDocumentLength = 0
    self.averageDocumentLength = 0

    self.documents.reserveCapacity(documents.count)
    self.termCounts.reserveCapacity(documents.count)

    for document in documents {
      index(document)
    }
    refreshAverageLength()
  }

  /// Searches the index using BM25 scoring
  ///
  /// - Parameters:
  ///   - query: The search query
  ///   - topK: Maximum number of results to return; values `<= 0` yield no results
  /// - Returns: Array of tuples containing documents and their BM25 scores, best first.
  ///   Only documents with a strictly positive score are returned.
  public func search(query: String, topK: Int = 10) -> [(document: VecturaDocument, score: Float)] {
    guard topK > 0, !documents.isEmpty else { return [] }

    let queryTerms = tokenize(query)
    guard !queryTerms.isEmpty else { return [] }

    // The IDF of a term does not depend on the document, so compute it once per term.
    let corpusSize = Float(documents.count)
    var idfByTerm: [String: Float] = [:]
    for term in queryTerms where idfByTerm[term] == nil {
      let df = Float(documentFrequencies[term] ?? 0)
      idfByTerm[term] = log((corpusSize - df + 0.5) / (df + 0.5))
    }

    var ranked: [(document: VecturaDocument, score: Float)] = []

    for (position, document) in documents.enumerated() {
      let counts = termCounts[position]
      let docLength = Float(documentLengths[document.id] ?? 0)
      // An all-empty corpus has an average length of 0; avoid 0/0 = NaN.
      let lengthRatio = averageDocumentLength > 0 ? docLength / averageDocumentLength : 0
      let lengthNormalization = k1 * (1 - b + b * lengthRatio)

      var score: Float = 0.0
      // Repeated query terms contribute once per occurrence in the query.
      for term in queryTerms {
        // A term absent from the document contributes exactly 0, so skip it.
        guard let count = counts[term], let idf = idfByTerm[term] else { continue }
        let tf = Float(count)
        let numerator = tf * (k1 + 1)
        let denominator = tf + lengthNormalization
        score += idf * (numerator / denominator)
      }

      if score > 0 {
        ranked.append((document: document, score: score))
      }
    }

    ranked.sort { $0.score > $1.score }
    return Array(ranked.prefix(topK))
  }

  /// Add a new document to the index
  ///
  /// - Parameter document: The document to add
  public mutating func addDocument(_ document: VecturaDocument) {
    index(document)
    refreshAverageLength()
  }

  /// Tokenizes `document` once and records its length, term counts and document frequencies.
  private mutating func index(_ document: VecturaDocument) {
    let tokens = tokenize(document.text)

    documents.append(document)

    // Lengths are keyed by ID: if the ID was already present, its old length is replaced.
    if let previousLength = documentLengths.updateValue(tokens.count, forKey: document.id) {
      totalDocumentLength -= previousLength
    }
    totalDocumentLength += tokens.count

    var counts: [String: Int] = [:]
    for token in tokens {
      counts[token, default: 0] += 1
    }
    termCounts.append(counts)

    for term in counts.keys {
      documentFrequencies[term, default: 0] += 1
    }
  }

  /// Recomputes `averageDocumentLength`, leaving it at `0` for an empty index.
  private mutating func refreshAverageLength() {
    averageDocumentLength =
      documents.isEmpty ? 0 : Float(totalDocumentLength) / Float(documents.count)
  }
}

extension VecturaDocument {
  /// Calculates a hybrid search score combining vector similarity and BM25
  ///
  /// The BM25 score is divided by 10 and clamped to `0...1` before it is blended.
  ///
  /// - Parameters:
  ///   - vectorScore: The vector similarity score
  ///   - bm25Score: The BM25 score
  ///   - weight: Weight for vector score (0.0-1.0), BM25 weight will be (1-weight)
  /// - Returns: Combined score
  public func hybridScore(vectorScore: Float, bm25Score: Float, weight: Float = 0.5) -> Float {
    let normalizedBM25 = min(max(bm25Score / 10.0, 0), 1)
    return weight * vectorScore + (1 - weight) * normalizedBM25
  }
}

--------------------------------------------------------------------------------
/Sources/VecturaKit/FileStorageProvider.swift:
--------------------------------------------------------------------------------
import Foundation
import Accelerate

/// A file‑based storage provider that implements VecturaStorage using JSON files.
/// This provider maintains an in‑memory cache of documents while persisting them
/// to a specified storage directory, one `<uuid>.json` file per document.
public class FileStorageProvider: VecturaStorage {
  /// The storage directory where JSON files are stored.
  private let storageDirectory: URL

  /// In‑memory cache of documents keyed by their UUID.
  private var documents: [UUID: VecturaDocument] = [:]

  /// In‑memory cache of normalized embeddings for each document.
  private var normalizedEmbeddings: [UUID: [Float]] = [:]

  /// Initializes the provider with the target storage directory.
  ///
  /// The directory is created if it does not exist, and every readable JSON document
  /// already in it is loaded into memory.
  ///
  /// - Parameter storageDirectory: The directory URL where documents will be saved and loaded.
  /// - Throws: If the directory cannot be created or its contents cannot be listed.
  public init(storageDirectory: URL) throws {
    self.storageDirectory = storageDirectory

    try FileStorageProvider.ensureDirectoryExists(at: storageDirectory)

    // Load any existing documents.
    try loadDocumentsFromStorage()
  }

  /// Ensures that the storage directory exists.
  public func createStorageDirectoryIfNeeded() async throws {
    try FileStorageProvider.ensureDirectoryExists(at: storageDirectory)
  }

  /// Loads documents from in‑memory cache.
  /// This function returns the documents that were loaded during initialization
  /// plus any saved since; it does not touch the disk.
  public func loadDocuments() async throws -> [VecturaDocument] {
    return Array(documents.values)
  }

  /// Saves a document by encoding it to JSON and writing it to disk.
  /// It also updates the in‑memory caches for the document and its normalized embedding.
  ///
  /// The caches are only updated after the file has been written, so a failed write
  /// never leaves a document in memory that is missing on disk.
  ///
  /// - Throws: If encoding or writing the file fails.
  public func saveDocument(_ document: VecturaDocument) async throws {
    let encoder = JSONEncoder()
    encoder.outputFormatting = .prettyPrinted
    let data = try encoder.encode(document)

    // Atomic write: an interrupted write cannot leave a truncated JSON file behind.
    try data.write(to: fileURL(for: document.id), options: .atomic)

    documents[document.id] = document
    normalizedEmbeddings[document.id] = FileStorageProvider.normalized(document.embedding)
  }

  /// Deletes a document by removing it from the in‑memory caches and deleting its file.
  ///
  /// - Throws: If the file cannot be removed (including when it does not exist). The
  ///   caches are cleared even in that case.
  public func deleteDocument(withID id: UUID) async throws {
    // Remove from caches
    documents.removeValue(forKey: id)
    normalizedEmbeddings.removeValue(forKey: id)

    try FileManager.default.removeItem(at: fileURL(for: id))
  }

  /// Updates an existing document.
  /// This is implemented by saving the updated document, which overwrites the existing file.
  public func updateDocument(_ document: VecturaDocument) async throws {
    try await saveDocument(document)
  }

  // MARK: - Private Helper Methods

  /// Returns the on-disk location of the document with the given ID.
  private func fileURL(for id: UUID) -> URL {
    storageDirectory.appendingPathComponent("\(id).json")
  }

  /// Creates `directory` (and any missing parents) unless it already exists.
  private static func ensureDirectoryExists(at directory: URL) throws {
    if !FileManager.default.fileExists(atPath: directory.path) {
      try FileManager.default.createDirectory(at: directory, withIntermediateDirectories: true)
    }
  }

  /// Loads all JSON‑encoded documents from disk into memory.
  ///
  /// Loading is best-effort: a file that cannot be read or decoded is reported on
  /// standard output and skipped, so one corrupt file does not block the others.
  private func loadDocumentsFromStorage() throws {
    let fileURLs = try FileManager.default.contentsOfDirectory(
      at: storageDirectory, includingPropertiesForKeys: nil)
    let decoder = JSONDecoder()

    for fileURL in fileURLs where fileURL.pathExtension.lowercased() == "json" {
      do {
        let data = try Data(contentsOf: fileURL)
        let doc = try decoder.decode(VecturaDocument.self, from: data)
        documents[doc.id] = doc
        normalizedEmbeddings[doc.id] = FileStorageProvider.normalized(doc.embedding)
      } catch {
        // Log the error if needed
        print("Failed to load \(fileURL.lastPathComponent): \(error.localizedDescription)")
      }
    }
  }

  /// Returns `vector` divided by its L2 norm.
  ///
  /// `1e-9` is added to the norm so that an all-zero vector does not divide by zero.
  private static func normalized(_ vector: [Float]) -> [Float] {
    var divisor = l2Norm(vector) + 1e-9
    var result = [Float](repeating: 0, count: vector.count)
    vDSP_vsdiv(vector, 1, &divisor, &result, 1, vDSP_Length(vector.count))
    return result
  }

  /// Computes the L2 norm of a vector.
  private static func l2Norm(_ vector: [Float]) -> Float {
    var sumSquares: Float = 0
    vDSP_svesq(vector, 1, &sumSquares, vDSP_Length(vector.count))
    return sqrt(sumSquares)
  }
}

--------------------------------------------------------------------------------
/Sources/VecturaKit/VecturaConfig.swift:
--------------------------------------------------------------------------------
import Foundation

/// Configuration options for Vectura vector database.
public struct VecturaConfig {

  /// The name of the database instance.
  public let name: String

  /// A custom directory where the database should be stored.
  /// Will be created if it doesn't exist, database contents are stored in a subdirectory named after ``name``.
  public let directoryURL: URL?

  /// The dimension of vectors to be stored.
  public let dimension: Int

  /// Options for similarity search.
  public struct SearchOptions {
    /// The default number of results to return.
    public var defaultNumResults: Int = 10

    /// The minimum similarity threshold.
    public var minThreshold: Float?

    /// Weight for vector similarity in hybrid search (0.0-1.0)
    /// BM25 weight will be (1-hybridWeight)
    public var hybridWeight: Float = 0.5

    /// BM25 `k1` parameter.
    public var k1: Float = 1.2

    /// BM25 `b` parameter.
    public var b: Float = 0.75

    /// Creates search options; every parameter has a default.
    public init(
      defaultNumResults: Int = 10,
      minThreshold: Float? = nil,
      hybridWeight: Float = 0.5,
      k1: Float = 1.2,
      b: Float = 0.75
    ) {
      self.defaultNumResults = defaultNumResults
      self.minThreshold = minThreshold
      self.hybridWeight = hybridWeight
      self.k1 = k1
      self.b = b
    }
  }

  /// Search configuration options.
  public var searchOptions: SearchOptions

  /// Creates a database configuration.
  ///
  /// - Parameters:
  ///   - name: The name of the database instance.
  ///   - directoryURL: Optional custom parent directory for the database.
  ///   - dimension: The dimension of vectors to be stored.
  ///   - searchOptions: Search configuration; defaults to `SearchOptions()`.
  public init(
    name: String,
    directoryURL: URL? = nil,
    dimension: Int,
    searchOptions: SearchOptions = SearchOptions()
  ) {
    self.name = name
    self.directoryURL = directoryURL
    self.dimension = dimension
    self.searchOptions = searchOptions
  }
}

--------------------------------------------------------------------------------
/Sources/VecturaKit/VecturaDocument.swift:
--------------------------------------------------------------------------------
import Foundation

/// A document stored in the vector database.
public struct VecturaDocument: Identifiable, Codable, Sendable {
  /// The unique identifier of the document.
  public let id: UUID

  /// The text content of the document.
  public let text: String

  /// The vector embedding of the document.
  public let embedding: [Float]

  /// The timestamp when the document was created.
  public let createdAt: Date

  /// Creates a new document with the given properties.
  /// - Parameters:
  ///   - id: The unique identifier for the document. If nil, a new UUID will be generated.
  ///   - text: The text content of the document.
  ///   - embedding: The vector embedding of the document.
  ///   - createdAt: The creation timestamp. Defaults to the current date; pass an
  ///     existing value to keep the original timestamp when rebuilding a document.
  public init(id: UUID? = nil, text: String, embedding: [Float], createdAt: Date = Date()) {
    self.id = id ?? UUID()
    self.text = text
    self.embedding = embedding
    self.createdAt = createdAt
  }

  // MARK: - Codable
  enum CodingKeys: String, CodingKey {
    case id, text, embedding, createdAt
  }
}

--------------------------------------------------------------------------------
/Sources/VecturaKit/VecturaError.swift:
--------------------------------------------------------------------------------
import Foundation

/// Errors that can occur when using VecturaKit.
public enum VecturaError: LocalizedError {
  /// Thrown when attempting to create a collection that already exists.
  case collectionAlreadyExists(String)

  /// Thrown when attempting to access a collection that doesn't exist.
  case collectionNotFound(String)

  /// Thrown when vector dimensions don't match.
  case dimensionMismatch(expected: Int, got: Int)

  /// Thrown when loading collection data fails.
  case loadFailed(String)

  /// Thrown when input validation fails.
  case invalidInput(String)

  /// A human-readable description of the error.
  public var errorDescription: String? {
    switch self {
    case .collectionAlreadyExists(let name):
      "A collection named '\(name)' already exists."
    case .collectionNotFound(let name):
      "Collection '\(name)' not found."
    case .dimensionMismatch(let expected, let got):
      "Vector dimension mismatch. Expected \(expected) but got \(got)."
    case .loadFailed(let reason):
      "Failed to load collection: \(reason)"
    case .invalidInput(let reason):
      "Invalid input: \(reason)"
    }
  }
}

-------------------------------------------------------------------------------- /Sources/VecturaKit/VecturaKit.swift: -------------------------------------------------------------------------------- 1 | import Accelerate 2 | import CoreML 3 | import Embeddings 4 | import Foundation 5 | 6 | @available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, watchOS 11.0, *) 7 | /// A vector database implementation that stores and searches documents using their vector embeddings. 8 | public class VecturaKit: VecturaProtocol { 9 | 10 | /// The configuration for this vector database instance. 11 | private let config: VecturaConfig 12 | 13 | /// In-memory cache of all documents. 14 | private var documents: [UUID: VecturaDocument] 15 | 16 | /// The storage directory for documents. 17 | private let storageDirectory: URL 18 | 19 | /// The storage provider that handles document persistence. 20 | private let storageProvider: VecturaStorage 21 | 22 | /// Cached normalized embeddings for faster searches. 23 | private var normalizedEmbeddings: [UUID: [Float]] = [:] 24 | 25 | /// BM25 index for text search 26 | private var bm25Index: BM25Index? 27 | 28 | /// Swift-Embeddings model bundle that you can reuse (e.g. BERT, XLM-R, CLIP, etc.) 29 | private var bertModel: Bert.ModelBundle?
30 | 31 | // MARK: - Initialization 32 | 33 | public init(config: VecturaConfig) async throws { 34 | self.config = config 35 | self.documents = [:] 36 | 37 | if let customStorageDirectory = config.directoryURL { 38 | let databaseDirectory = customStorageDirectory.appending(path: config.name) 39 | if !FileManager.default.fileExists(atPath: databaseDirectory.path(percentEncoded: false)) { 40 | try FileManager.default.createDirectory( 41 | at: databaseDirectory, withIntermediateDirectories: true) 42 | } 43 | self.storageDirectory = databaseDirectory 44 | } else { 45 | // Create default storage directory 46 | self.storageDirectory = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask) 47 | .first! 48 | .appendingPathComponent("VecturaKit") 49 | .appendingPathComponent(config.name) 50 | } 51 | 52 | try FileManager.default.createDirectory(at: storageDirectory, withIntermediateDirectories: true) 53 | 54 | // Instantiate the storage provider (currently the file-based implementation). 55 | self.storageProvider = try FileStorageProvider(storageDirectory: storageDirectory) 56 | 57 | // Load existing documents using the storage provider. 58 | let storedDocuments = try await storageProvider.loadDocuments() 59 | for doc in storedDocuments { 60 | self.documents[doc.id] = doc 61 | // Compute normalized embedding and store in cache. 62 | let norm = l2Norm(doc.embedding) 63 | var divisor = norm + 1e-9 64 | var normalized = [Float](repeating: 0, count: doc.embedding.count) 65 | vDSP_vsdiv(doc.embedding, 1, &divisor, &normalized, 1, vDSP_Length(doc.embedding.count)) 66 | self.normalizedEmbeddings[doc.id] = normalized 67 | } 68 | } 69 | 70 | /// Adds multiple documents to the vector store in batch. 71 | public func addDocuments( 72 | texts: [String], 73 | ids: [UUID]? 
= nil, 74 | model: VecturaModelSource = .default 75 | ) async throws -> [UUID] { 76 | if let ids = ids, ids.count != texts.count { 77 | throw VecturaError.invalidInput("Number of IDs must match number of texts") 78 | } 79 | 80 | if bertModel == nil { 81 | bertModel = try await Bert.loadModelBundle(from: model) 82 | } 83 | 84 | guard let modelBundle = bertModel else { 85 | throw VecturaError.invalidInput("Failed to load BERT model: \(model)") 86 | } 87 | 88 | let embeddingsTensor = try modelBundle.batchEncode(texts) 89 | let shape = embeddingsTensor.shape 90 | 91 | if shape.count != 2 { 92 | throw VecturaError.invalidInput("Expected shape [N, D], got \(shape)") 93 | } 94 | 95 | if shape[1] != config.dimension { 96 | throw VecturaError.dimensionMismatch( 97 | expected: config.dimension, 98 | got: shape[1] 99 | ) 100 | } 101 | 102 | let embeddingShapedArray = await embeddingsTensor.cast(to: Float.self).shapedArray( 103 | of: Float.self) 104 | let allScalars = embeddingShapedArray.scalars 105 | 106 | var documentIds = [UUID]() 107 | var documentsToSave = [VecturaDocument]() 108 | 109 | for i in 0.. 
[VecturaSearchResult] { 166 | if queryEmbedding.count != config.dimension { 167 | throw VecturaError.dimensionMismatch( 168 | expected: config.dimension, 169 | got: queryEmbedding.count 170 | ) 171 | } 172 | 173 | // Normalize the query vector 174 | let norm = l2Norm(queryEmbedding) 175 | var divisor = norm + 1e-9 176 | var normalizedQuery = [Float](repeating: 0, count: queryEmbedding.count) 177 | vDSP_vsdiv(queryEmbedding, 1, &divisor, &normalizedQuery, 1, vDSP_Length(queryEmbedding.count)) 178 | 179 | // Build a matrix of normalized document embeddings in row-major order 180 | var docIds = [UUID]() 181 | var matrix = [Float]() 182 | matrix.reserveCapacity(documents.count * config.dimension) // Pre-allocate for better performance 183 | 184 | for doc in documents.values { 185 | if let normalized = normalizedEmbeddings[doc.id] { 186 | docIds.append(doc.id) 187 | matrix.append(contentsOf: normalized) 188 | } 189 | } 190 | 191 | let docsCount = docIds.count 192 | if docsCount == 0 { 193 | return [] 194 | } 195 | 196 | let M = Int32(docsCount) // number of rows (documents) 197 | let N = Int32(config.dimension) // number of columns (embedding dimension) 198 | var similarities = [Float](repeating: 0, count: docsCount) 199 | 200 | // Convert Int32 to Int for LAPACK compatibility 201 | let mInt = Int(M) // Convert number of rows 202 | let nInt = Int(N) // Convert number of columns 203 | let ldInt = Int(N) // Convert leading dimension 204 | 205 | // Compute all similarities at once using matrix-vector multiplication 206 | // Matrix is in row-major order, so we use CblasNoTrans 207 | cblas_sgemv( 208 | CblasRowMajor, // matrix layout 209 | CblasNoTrans, // no transpose needed for row-major 210 | mInt, // number of rows (documents) as Int 211 | nInt, // number of columns (dimension) as Int 212 | 1.0, // alpha scaling factor 213 | matrix, // matrix 214 | ldInt, // leading dimension as Int 215 | normalizedQuery, // vector 216 | 1, // vector increment 217 | 0.0, // beta scaling 
factor 218 | &similarities, // result vector 219 | 1 // result increment 220 | ) 221 | 222 | // Construct the results 223 | var results = [VecturaSearchResult]() 224 | results.reserveCapacity(docsCount) // Pre-allocate for better performance 225 | 226 | for (i, similarity) in similarities.enumerated() { 227 | if let minT = threshold ?? config.searchOptions.minThreshold, similarity < minT { 228 | continue 229 | } 230 | if let doc = documents[docIds[i]] { 231 | results.append( 232 | VecturaSearchResult( 233 | id: doc.id, 234 | text: doc.text, 235 | score: similarity, 236 | createdAt: doc.createdAt 237 | ) 238 | ) 239 | } 240 | } 241 | 242 | results.sort { $0.score > $1.score } 243 | 244 | let limit = numResults ?? config.searchOptions.defaultNumResults 245 | return Array(results.prefix(limit)) 246 | } 247 | 248 | public func search( 249 | query: String, 250 | numResults: Int? = nil, 251 | threshold: Float? = nil, 252 | model: VecturaModelSource = .default 253 | ) async throws -> [VecturaSearchResult] { 254 | if bertModel == nil { 255 | bertModel = try await Bert.loadModelBundle(from: model) 256 | } 257 | 258 | guard let modelBundle = bertModel else { 259 | throw VecturaError.invalidInput("Failed to load BERT model: \(model)") 260 | } 261 | 262 | // Initialize BM25 index if needed 263 | if bm25Index == nil { 264 | let docs = documents.values.map { $0 } 265 | bm25Index = BM25Index( 266 | documents: docs, 267 | k1: config.searchOptions.k1, 268 | b: config.searchOptions.b 269 | ) 270 | } 271 | 272 | // Get vector similarity results 273 | let queryEmbeddingTensor = try modelBundle.encode(query) 274 | let queryEmbeddingFloatArray = await tensorToArray(queryEmbeddingTensor) 275 | let vectorResults = try await search( 276 | query: queryEmbeddingFloatArray, 277 | numResults: nil, 278 | threshold: nil 279 | ) 280 | 281 | let bm25Results = 282 | bm25Index?.search( 283 | query: query, 284 | topK: documents.count 285 | ) ?? 
[] 286 | 287 | // Create a map of document IDs to their BM25 scores 288 | let bm25Scores = Dictionary( 289 | bm25Results.map { ($0.document.id, $0.score) }, 290 | uniquingKeysWith: { first, _ in first } 291 | ) 292 | 293 | // Combine scores using hybrid scoring 294 | var hybridResults = vectorResults.map { result in 295 | let bm25Score = bm25Scores[result.id] ?? 0 296 | let hybridScore = VecturaDocument( 297 | id: result.id, 298 | text: result.text, 299 | embedding: [] 300 | ).hybridScore( 301 | vectorScore: result.score, 302 | bm25Score: bm25Score, 303 | weight: config.searchOptions.hybridWeight 304 | ) 305 | 306 | return VecturaSearchResult( 307 | id: result.id, 308 | text: result.text, 309 | score: hybridScore, 310 | createdAt: result.createdAt 311 | ) 312 | } 313 | 314 | hybridResults.sort { $0.score > $1.score } 315 | 316 | if let threshold = threshold ?? config.searchOptions.minThreshold { 317 | hybridResults = hybridResults.filter { $0.score >= threshold } 318 | } 319 | 320 | let limit = numResults ?? config.searchOptions.defaultNumResults 321 | return Array(hybridResults.prefix(limit)) 322 | } 323 | 324 | @_disfavoredOverload 325 | public func search( 326 | query: String, 327 | numResults: Int? = nil, 328 | threshold: Float? 
= nil, 329 | modelId: String = VecturaModelSource.defaultModelId 330 | ) async throws -> [VecturaSearchResult] { 331 | try await search( 332 | query: query, numResults: numResults, threshold: threshold, model: .id(modelId)) 333 | } 334 | 335 | /// Removes every document from memory and deletes every file in the storage directory. 336 | public func reset() async throws { 337 | documents.removeAll() 338 | normalizedEmbeddings.removeAll() 339 | 340 | // Rebuild the BM25 index (when one exists) so it stops referencing the documents just removed. 341 | if bm25Index != nil { 342 | bm25Index = BM25Index( 343 | documents: [], 344 | k1: config.searchOptions.k1, 345 | b: config.searchOptions.b 346 | ) 347 | } 348 | 349 | let files = try FileManager.default.contentsOfDirectory( 350 | at: storageDirectory, includingPropertiesForKeys: nil) 351 | for fileURL in files { 352 | try FileManager.default.removeItem(at: fileURL) 353 | } 354 | } 355 | 356 | public func deleteDocuments(ids: [UUID]) async throws { 357 | if bm25Index != nil { 358 | let remainingDocs = documents.values.filter { !ids.contains($0.id) } 359 | bm25Index = BM25Index( 360 | documents: Array(remainingDocs), 361 | k1: config.searchOptions.k1, 362 | b: config.searchOptions.b 363 | ) 364 | } 365 | 366 | for id in ids { 367 | documents[id] = nil 368 | normalizedEmbeddings[id] = nil 369 | 370 | let documentURL = storageDirectory.appendingPathComponent("\(id).json") 371 | try FileManager.default.removeItem(at: documentURL) 372 | } 373 | } 374 | 375 | public func updateDocument( 376 | id: UUID, 377 | newText: String, 378 | model: VecturaModelSource = .default 379 | ) async throws { 380 | try await deleteDocuments(ids: [id]) 381 | 382 | _ = try await addDocument(text: newText, id: id, model: model) 383 | } 384 | 385 | @_disfavoredOverload 386 | public func updateDocument( 387 | id: UUID, 388 | newText: String, 389 | modelId: String = VecturaModelSource.defaultModelId 390 | ) async throws { 391 | try await updateDocument(id: id, newText: newText, model: .id(modelId)) 392 | } 393 | 394 | // MARK: - Private 395 | 396 | private func tensorToArray(_ tensor: MLTensor) async -> [Float] { 397 | let shaped = await tensor.cast(to: Float.self).shapedArray(of: Float.self) 398 | return shaped.scalars 399 | } 400 | 401 | private func dotProduct(_ a: [Float], _ b: [Float]) -> Float { 402 | 
var result: Float = 0 403 | vDSP_dotpr(a, 1, b, 1, &result, vDSP_Length(a.count)) 404 | return result 405 | } 406 | 407 | private func l2Norm(_ v: [Float]) -> Float { 408 | var sumSquares: Float = 0 409 | vDSP_svesq(v, 1, &sumSquares, vDSP_Length(v.count)) 410 | return sqrt(sumSquares) 411 | } 412 | } 413 | 414 | @available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, watchOS 11.0, *) 415 | extension Bert { 416 | static func loadModelBundle(from source: VecturaModelSource) async throws -> Bert.ModelBundle { 417 | switch source { 418 | case .id(let modelId): 419 | try await loadModelBundle(from: modelId) 420 | case .folder(let url): 421 | try await loadModelBundle(from: url) 422 | } 423 | } 424 | } 425 | -------------------------------------------------------------------------------- /Sources/VecturaKit/VecturaModelSource.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | 3 | /// Specifies where to obtain the resources for an embedding model. 4 | public enum VecturaModelSource: Sendable, CustomStringConvertible { 5 | /// Automatically fetch the model from a remote repository based on its id. 6 | case id(_ id: String) 7 | /// Load a local model from the specified directory URL. 8 | case folder(_ url: URL) 9 | } 10 | 11 | public extension VecturaModelSource { 12 | /// The default model identifier when not otherwise specified. 13 | static let defaultModelId: String = "sentence-transformers/all-MiniLM-L6-v2" 14 | 15 | /// The default model when not otherwise specified. 
16 | static let `default` = VecturaModelSource.id(VecturaModelSource.defaultModelId) 17 | } 18 | 19 | public extension VecturaModelSource { 20 | var description: String { 21 | switch self { 22 | case .id(let id): id 23 | case .folder(let url): url.path(percentEncoded: false) 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /Sources/VecturaKit/VecturaProtocol.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | 3 | /// A protocol defining the requirements for a vector database instance. 4 | public protocol VecturaProtocol { 5 | 6 | /// Adds multiple documents to the vector store in batch. 7 | /// 8 | /// - Parameters: 9 | /// - texts: The text contents of the documents. 10 | /// - ids: Optional unique identifiers for the documents. 11 | /// - model: A ``VecturaModelSource`` specifying how to load the model. 12 | /// (e.g.,`.id("sentence-transformers/all-MiniLM-L6-v2")`). 13 | /// - Returns: The IDs of the added documents. 14 | func addDocuments( 15 | texts: [String], 16 | ids: [UUID]?, 17 | model: VecturaModelSource 18 | ) async throws -> [UUID] 19 | 20 | /// Searches for similar documents using a *pre-computed query embedding*. 21 | /// 22 | /// - Parameters: 23 | /// - query: The query vector to search with. 24 | /// - numResults: Maximum number of results to return. 25 | /// - threshold: Minimum similarity threshold. 26 | /// - Returns: An array of search results ordered by similarity. 27 | func search( 28 | query: [Float], 29 | numResults: Int?, 30 | threshold: Float? 31 | ) async throws -> [VecturaSearchResult] 32 | 33 | /// Removes all documents from the vector store. 34 | func reset() async throws 35 | } 36 | 37 | // MARK: - Default Implementations 38 | 39 | public extension VecturaProtocol { 40 | 41 | /// Adds a document to the vector store by embedding text. 42 | /// 43 | /// - Parameters: 44 | /// - text: The text content of the document. 
45 | /// - id: Optional unique identifier for the document. 46 | /// - model: A ``VecturaModelSource`` specifying how to load the model. 47 | /// (e.g.,`.id("sentence-transformers/all-MiniLM-L6-v2")`). 48 | /// - Returns: The ID of the added document. 49 | func addDocument( 50 | text: String, 51 | id: UUID? = nil, 52 | model: VecturaModelSource = .default 53 | ) async throws -> UUID { 54 | let ids = try await addDocuments( 55 | texts: [text], 56 | ids: id.map { [$0] }, 57 | model: model 58 | ) 59 | return ids[0] 60 | } 61 | 62 | /// Adds a document to the vector store by embedding text. 63 | /// 64 | /// - Parameters: 65 | /// - text: The text content of the document. 66 | /// - id: Optional unique identifier for the document. 67 | /// - modelId: Identifier of the model to use for generating the embedding 68 | /// (e.g., "sentence-transformers/all-MiniLM-L6-v2"). 69 | /// - Returns: The ID of the added document. 70 | @_disfavoredOverload 71 | func addDocument( 72 | text: String, 73 | id: UUID?, 74 | modelId: String = VecturaModelSource.defaultModelId 75 | ) async throws -> UUID { 76 | try await addDocument(text: text, id: id, model: .id(modelId)) 77 | } 78 | 79 | /// Adds multiple documents to the vector store in batch. 80 | /// 81 | /// - Parameters: 82 | /// - texts: The text contents of the documents. 83 | /// - ids: Optional unique identifiers for the documents. 84 | /// - modelId: Identifier of the model to use for generating the embedding 85 | /// (e.g., "sentence-transformers/all-MiniLM-L6-v2"). 86 | /// - Returns: The IDs of the added documents. 87 | func addDocuments( 88 | texts: [String], 89 | ids: [UUID]? 
= nil, 90 | modelId: String = VecturaModelSource.defaultModelId 91 | ) async throws -> [UUID] { 92 | try await addDocuments(texts: texts, ids: ids, model: .id(modelId)) 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /Sources/VecturaKit/VecturaSearchResult.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | 3 | /// Represents a search result from the vector database. 4 | public struct VecturaSearchResult: Identifiable, Sendable { 5 | 6 | /// The unique identifier of the matching document. 7 | public let id: UUID 8 | 9 | /// The text content of the matching document. 10 | public let text: String 11 | 12 | /// The similarity score between the query and the document. 13 | public let score: Float 14 | 15 | /// The timestamp when the document was created. 16 | public let createdAt: Date 17 | 18 | /// Creates a new search result with the given properties. 19 | /// 20 | /// - Parameters: 21 | /// - id: The unique identifier of the matching document. 22 | /// - text: The text content of the matching document. 23 | /// - score: The similarity score between the query and the document. 24 | /// - createdAt: The timestamp when the document was created. 25 | public init(id: UUID, text: String, score: Float, createdAt: Date) { 26 | self.id = id 27 | self.text = text 28 | self.score = score 29 | self.createdAt = createdAt 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /Sources/VecturaKit/VecturaStorage.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | 3 | /// VecturaStorage protocol abstracts the persistence layer for VecturaDocuments. 4 | /// 5 | /// It allows for multiple underlying storage implementations (e.g., File-based or SQLite) 6 | /// without changing the higher-level API used in VecturaKit. 
7 | public protocol VecturaStorage { 8 | /// Prepares or creates the storage location for documents if needed. 9 | func createStorageDirectoryIfNeeded() async throws 10 | 11 | /// Loads the persisted documents. 12 | /// 13 | /// - Returns: An array of VecturaDocument. 14 | func loadDocuments() async throws -> [VecturaDocument] 15 | 16 | /// Saves a document. 17 | /// 18 | /// - Parameter document: The document to save. 19 | func saveDocument(_ document: VecturaDocument) async throws 20 | 21 | /// Deletes a document by its unique identifier. 22 | /// 23 | /// - Parameter id: The identifier of the document to be deleted. 24 | func deleteDocument(withID id: UUID) async throws 25 | 26 | /// Updates an existing document. The document is replaced or modified as needed. 27 | /// 28 | /// - Parameter document: The updated document. 29 | func updateDocument(_ document: VecturaDocument) async throws 30 | } 31 | -------------------------------------------------------------------------------- /Sources/VecturaMLXCLI/VecturaMLXCLI.swift: -------------------------------------------------------------------------------- 1 | import ArgumentParser 2 | import Foundation 3 | import MLXEmbedders 4 | import VecturaKit 5 | import VecturaMLXKit 6 | 7 | @available(macOS 14.0, iOS 17.0, tvOS 17.0, visionOS 1.0, watchOS 10.0, *) 8 | @main 9 | struct VecturaMLXCLI: AsyncParsableCommand { 10 | struct DocumentID: ExpressibleByArgument, Decodable { 11 | let uuid: UUID 12 | 13 | init(_ uuid: UUID) { 14 | self.uuid = uuid 15 | } 16 | 17 | init?(argument: String) { 18 | guard let uuid = UUID(uuidString: argument) else { return nil } 19 | self.uuid = uuid 20 | } 21 | } 22 | 23 | static let configuration = CommandConfiguration( 24 | commandName: "vectura-mlx", 25 | abstract: "A CLI tool for VecturaMLXKit vector database using MLX", 26 | subcommands: [Add.self, Search.self, Update.self, Delete.self, Reset.self, Mock.self] 27 | ) 28 | 29 | static func setupDB( 30 | dbName: String, modelConfiguration: 
MLXEmbedders.ModelConfiguration = .nomic_text_v1_5 31 | ) 32 | async throws 33 | -> VecturaMLXKit 34 | { 35 | let config = VecturaConfig( 36 | name: dbName, 37 | dimension: 768 // nomic_text_v1_5 model outputs 768-dimensional embeddings 38 | ) 39 | return try await VecturaMLXKit(config: config, modelConfiguration: modelConfiguration) 40 | } 41 | } 42 | 43 | @available(macOS 14.0, iOS 17.0, tvOS 17.0, visionOS 1.0, watchOS 10.0, *) 44 | extension VecturaMLXCLI { 45 | struct Mock: AsyncParsableCommand { 46 | static let configuration = CommandConfiguration( 47 | abstract: "Run a mock demonstration with sample data" 48 | ) 49 | 50 | @Option(name: [.long, .customShort("d")], help: "Database name") 51 | var dbName: String = "vectura-mlx-cli-db" 52 | 53 | mutating func run() async throws { 54 | print("Starting mock command...") 55 | 56 | print("Setting up database...") 57 | let db = try await VecturaMLXCLI.setupDB(dbName: dbName) 58 | print("Database setup complete") 59 | 60 | // First, reset the database 61 | print("\n🧹 Resetting database...") 62 | try await db.reset() 63 | print("Reset complete") 64 | 65 | // Add sample documents 66 | print("\n📝 Adding sample documents...") 67 | let sampleTexts = [ 68 | "The quick brown fox jumps over the lazy dog", 69 | "To be or not to be, that is the question", 70 | "All that glitters is not gold", 71 | "A journey of a thousand miles begins with a single step", 72 | "Where there's smoke, there's fire", 73 | ] 74 | 75 | let ids = try await db.addDocuments(texts: sampleTexts) 76 | print("Added \(ids.count) documents:") 77 | for (id, text) in zip(ids, sampleTexts) { 78 | print("ID: \(id)") 79 | print("Text: \(text)") 80 | print("---") 81 | } 82 | 83 | // Search for documents 84 | print("\n🔍 Searching for 'journey'...") 85 | let results = try await db.search(query: "journey") 86 | 87 | print("Found \(results.count) results:") 88 | for result in results { 89 | print("ID: \(result.id)") 90 | print("Text: \(result.text)") 91 | print("Score: 
\(result.score)") 92 | print("Created: \(result.createdAt)") 93 | print("---") 94 | } 95 | 96 | // Update a document 97 | if let firstId = ids.first { 98 | print("\n✏️ Updating first document...") 99 | let newText = "The quick red fox jumps over the sleeping dog" 100 | try await db.updateDocument(id: firstId, newText: newText) 101 | print("Updated document \(firstId) with new text: \(newText)") 102 | } 103 | 104 | // Delete last document 105 | if let lastId = ids.last { 106 | print("\n🗑️ Deleting last document...") 107 | try await db.deleteDocuments(ids: [lastId]) 108 | print("Deleted document \(lastId)") 109 | } 110 | 111 | print("\n✨ Mock demonstration completed!") 112 | } 113 | } 114 | 115 | struct Add: AsyncParsableCommand { 116 | static let configuration = CommandConfiguration( 117 | abstract: "Add documents to the vector database" 118 | ) 119 | 120 | @Option(name: [.long, .customShort("d")], help: "Database name") 121 | var dbName: String = "vectura-mlx-cli-db" 122 | 123 | @Argument(help: "Text content to add") 124 | var text: [String] 125 | 126 | mutating func run() async throws { 127 | let db = try await VecturaMLXCLI.setupDB(dbName: dbName) 128 | let ids = try await db.addDocuments(texts: text) 129 | print("Added \(ids.count) documents:") 130 | for (id, text) in zip(ids, text) { 131 | print("ID: \(id)") 132 | print("Text: \(text)") 133 | print("---") 134 | } 135 | } 136 | } 137 | 138 | struct Search: AsyncParsableCommand { 139 | static let configuration = CommandConfiguration( 140 | abstract: "Search documents in the vector database" 141 | ) 142 | 143 | @Option(name: [.long, .customShort("d")], help: "Database name") 144 | var dbName: String = "vectura-mlx-cli-db" 145 | 146 | @Option(name: [.long, .customShort("t")], help: "Minimum similarity threshold") 147 | var threshold: Float? 148 | 149 | @Option(name: [.long, .customShort("n")], help: "Number of results to return") 150 | var numResults: Int? 
151 | 152 | @Argument(help: "Search query") 153 | var query: String 154 | 155 | mutating func run() async throws { 156 | guard !query.isEmpty else { 157 | print("Error: Query cannot be empty.") 158 | throw ExitCode.failure 159 | } 160 | 161 | let db = try await VecturaMLXCLI.setupDB(dbName: dbName) 162 | let results = try await db.search( 163 | query: query, 164 | numResults: numResults, 165 | threshold: threshold 166 | ) 167 | 168 | print("Found \(results.count) results:") 169 | for result in results { 170 | print("ID: \(result.id)") 171 | print("Text: \(result.text)") 172 | print("Score: \(result.score)") 173 | print("Created: \(result.createdAt)") 174 | print("---") 175 | } 176 | } 177 | } 178 | 179 | struct Update: AsyncParsableCommand, Decodable { 180 | static let configuration = CommandConfiguration( 181 | abstract: "Update a document in the vector database" 182 | ) 183 | 184 | @Option(name: [.long, .customShort("d")], help: "Database name") 185 | var dbName: String = "vectura-mlx-cli-db" 186 | 187 | @Argument(help: "Document ID to update") 188 | var id: DocumentID 189 | 190 | @Argument(help: "New text content") 191 | var newText: String 192 | 193 | mutating func run() async throws { 194 | let db = try await VecturaMLXCLI.setupDB(dbName: dbName) 195 | try await db.updateDocument(id: id.uuid, newText: newText) 196 | print("Updated document \(id.uuid) with new text: \(newText)") 197 | } 198 | } 199 | 200 | struct Delete: AsyncParsableCommand, Decodable { 201 | static let configuration = CommandConfiguration( 202 | abstract: "Delete documents from the vector database" 203 | ) 204 | 205 | @Option(name: [.long, .customShort("d")], help: "Database name") 206 | var dbName: String = "vectura-mlx-cli-db" 207 | 208 | @Argument(help: "Document IDs to delete") 209 | var ids: [DocumentID] 210 | 211 | mutating func run() async throws { 212 | let db = try await VecturaMLXCLI.setupDB(dbName: dbName) 213 | try await db.deleteDocuments(ids: ids.map(\.uuid)) 214 | 
print("Deleted \(ids.count) documents") 215 | } 216 | } 217 | 218 | struct Reset: AsyncParsableCommand { 219 | static let configuration = CommandConfiguration( 220 | abstract: "Reset the vector database" 221 | ) 222 | 223 | @Option(name: [.long, .customShort("d")], help: "Database name") 224 | var dbName: String = "vectura-mlx-cli-db" 225 | 226 | mutating func run() async throws { 227 | let db = try await VecturaMLXCLI.setupDB(dbName: dbName) 228 | try await db.reset() 229 | print("Database reset successfully") 230 | } 231 | } 232 | } 233 | -------------------------------------------------------------------------------- /Sources/VecturaMLXKit/MLXEmbedder.swift: -------------------------------------------------------------------------------- 1 | import Foundation 2 | import MLX 3 | import MLXEmbedders 4 | import VecturaKit 5 | 6 | /// Produces text embeddings with a model loaded through `MLXEmbedders`. 7 | @available(macOS 14.0, iOS 17.0, tvOS 17.0, visionOS 1.0, watchOS 10.0, *) 8 | public class MLXEmbedder { 9 | private let modelContainer: ModelContainer 10 | private let configuration: ModelConfiguration 11 | 12 | /// Loads the model container described by `configuration`. 13 | public init(configuration: ModelConfiguration = .nomic_text_v1_5) async throws { 14 | self.configuration = configuration 15 | self.modelContainer = try await MLXEmbedders.loadModelContainer(configuration: configuration) 16 | } 17 | 18 | /// Embeds a batch of texts in one model call. 19 | /// 20 | /// - Parameter texts: The texts to embed. 21 | /// - Returns: The pooled model output as `Float` arrays, or an empty array when `texts` is empty. 22 | public func embed(texts: [String]) async -> [[Float]] { 23 | // An empty batch leaves nothing to pad or stack, so return before invoking the model. 24 | guard !texts.isEmpty else { return [] } 25 | 26 | return await modelContainer.perform { (model: EmbeddingModel, tokenizer, pooling) -> [[Float]] in 27 | let inputs = texts.map { 28 | tokenizer.encode(text: $0, addSpecialTokens: true) 29 | } 30 | 31 | // Pad to longest 32 | let maxLength = inputs.reduce(into: 16) { acc, elem in 33 | acc = max(acc, elem.count) 34 | } 35 | 36 | let padded = stacked( 37 | inputs.map { elem in 38 | MLXArray( 39 | elem 40 | + Array( 41 | repeating: tokenizer.eosTokenId ?? 0, 42 | count: maxLength - elem.count)) 43 | }) 44 | 45 | let mask = (padded .!= tokenizer.eosTokenId ?? 
0) 46 | let tokenTypes = MLXArray.zeros(like: padded) 47 | 48 | let result = pooling( 49 | model(padded, positionIds: nil, tokenTypeIds: tokenTypes, attentionMask: mask), 50 | normalize: true, applyLayerNorm: true 51 | ) 52 | 53 | return result.map { $0.asArray(Float.self) } 54 | } 55 | } 56 | 57 | /// Embeds a single text. 58 | /// 59 | /// - Parameter text: The text to embed. 60 | /// - Returns: The first embedding of the one-element batch built from `text`. 61 | public func embed(text: String) async throws -> [Float] { 62 | let embeddings = await embed(texts: [text]) 63 | return embeddings[0] 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /Sources/VecturaMLXKit/VecturaMLXKit.swift: -------------------------------------------------------------------------------- 1 | import Accelerate 2 | import Foundation 3 | import MLXEmbedders 4 | import VecturaKit 5 | 6 | @available(macOS 14.0, iOS 17.0, tvOS 17.0, visionOS 1.0, watchOS 10.0, *) 7 | public class VecturaMLXKit { 8 | private let config: VecturaConfig 9 | private let embedder: MLXEmbedder 10 | private var documents: [UUID: VecturaDocument] = [:] 11 | private var normalizedEmbeddings: [UUID: [Float]] = [:] 12 | private let storageDirectory: URL 13 | 14 | public init(config: VecturaConfig, modelConfiguration: ModelConfiguration = .nomic_text_v1_5) 15 | async throws 16 | { 17 | self.config = config 18 | self.embedder = try await MLXEmbedder(configuration: modelConfiguration) 19 | 20 | if let customStorageDirectory = config.directoryURL { 21 | let databaseDirectory = customStorageDirectory.appending(path: config.name) 22 | 23 | if !FileManager.default.fileExists(atPath: databaseDirectory.path(percentEncoded: false)) { 24 | try FileManager.default.createDirectory( 25 | at: databaseDirectory, withIntermediateDirectories: true) 26 | } 27 | 28 | self.storageDirectory = databaseDirectory 29 | } else { 30 | // Create default storage directory 31 | self.storageDirectory = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask) 32 | .first! 
33 | .appendingPathComponent("VecturaKit") 34 | .appendingPathComponent(config.name) 35 | } 36 | 37 | try FileManager.default.createDirectory(at: storageDirectory, withIntermediateDirectories: true) 38 | 39 | // Attempt to load existing docs 40 | try loadDocuments() 41 | } 42 | 43 | public func addDocuments(texts: [String], ids: [UUID]? = nil) async throws -> [UUID] { 44 | if let ids = ids, ids.count != texts.count { 45 | throw VecturaError.invalidInput("Number of IDs must match number of texts") 46 | } 47 | 48 | let embeddings = await embedder.embed(texts: texts) 49 | var documentIds = [UUID]() 50 | var documentsToSave = [VecturaDocument]() 51 | 52 | for (index, text) in texts.enumerated() { 53 | let docId = ids?[index] ?? UUID() 54 | let doc = VecturaDocument(id: docId, text: text, embedding: embeddings[index]) 55 | 56 | // Normalize embedding for cosine similarity 57 | let norm = l2Norm(doc.embedding) 58 | var divisor = norm + 1e-9 59 | var normalized = [Float](repeating: 0, count: doc.embedding.count) 60 | vDSP_vsdiv(doc.embedding, 1, &divisor, &normalized, 1, vDSP_Length(doc.embedding.count)) 61 | 62 | normalizedEmbeddings[doc.id] = normalized 63 | documents[doc.id] = doc 64 | documentIds.append(docId) 65 | documentsToSave.append(doc) 66 | } 67 | 68 | try await withThrowingTaskGroup(of: Void.self) { group in 69 | let directory = self.storageDirectory 70 | 71 | for doc in documentsToSave { 72 | group.addTask { 73 | let documentURL = directory.appendingPathComponent("\(doc.id).json") 74 | let encoder = JSONEncoder() 75 | encoder.outputFormatting = .prettyPrinted 76 | 77 | let data = try encoder.encode(doc) 78 | try data.write(to: documentURL) 79 | } 80 | } 81 | 82 | try await group.waitForAll() 83 | } 84 | 85 | return documentIds 86 | } 87 | 88 | public func search(query: String, numResults: Int? = nil, threshold: Float? 
= nil) async throws 89 | -> [VecturaSearchResult] 90 | { 91 | guard !query.isEmpty else { 92 | throw VecturaError.invalidInput("Query cannot be empty") 93 | } 94 | 95 | let queryEmbedding = try await embedder.embed(text: query) 96 | 97 | let norm = l2Norm(queryEmbedding) 98 | var divisorQuery = norm + 1e-9 99 | var normalizedQuery = [Float](repeating: 0, count: queryEmbedding.count) 100 | vDSP_vsdiv( 101 | queryEmbedding, 1, &divisorQuery, &normalizedQuery, 1, vDSP_Length(queryEmbedding.count)) 102 | 103 | var results: [VecturaSearchResult] = [] 104 | 105 | for doc in documents.values { 106 | guard let normDoc = normalizedEmbeddings[doc.id] else { continue } 107 | let similarity = dotProduct(normalizedQuery, normDoc) 108 | 109 | if let minT = threshold ?? config.searchOptions.minThreshold, similarity < minT { 110 | continue 111 | } 112 | 113 | results.append( 114 | VecturaSearchResult( 115 | id: doc.id, 116 | text: doc.text, 117 | score: similarity, 118 | createdAt: doc.createdAt 119 | ) 120 | ) 121 | } 122 | 123 | results.sort { $0.score > $1.score } 124 | 125 | let limit = numResults ?? 
config.searchOptions.defaultNumResults 126 | return Array(results.prefix(limit)) 127 | } 128 | 129 | public func deleteDocuments(ids: [UUID]) async throws { 130 | for id in ids { 131 | documents[id] = nil 132 | normalizedEmbeddings[id] = nil 133 | 134 | let documentURL = storageDirectory.appendingPathComponent("\(id).json") 135 | try FileManager.default.removeItem(at: documentURL) 136 | } 137 | } 138 | 139 | public func updateDocument(id: UUID, newText: String) async throws { 140 | try await deleteDocuments(ids: [id]) 141 | _ = try await addDocuments(texts: [newText], ids: [id]) 142 | } 143 | 144 | public func reset() async throws { 145 | documents.removeAll() 146 | normalizedEmbeddings.removeAll() 147 | 148 | let files = try FileManager.default.contentsOfDirectory( 149 | at: storageDirectory, includingPropertiesForKeys: nil) 150 | for fileURL in files { 151 | try FileManager.default.removeItem(at: fileURL) 152 | } 153 | } 154 | 155 | // MARK: - Private 156 | 157 | private func loadDocuments() throws { 158 | let fileURLs = try FileManager.default.contentsOfDirectory( 159 | at: storageDirectory, includingPropertiesForKeys: nil) 160 | 161 | let decoder = JSONDecoder() 162 | var loadErrors: [String] = [] 163 | 164 | for fileURL in fileURLs where fileURL.pathExtension == "json" { 165 | do { 166 | let data = try Data(contentsOf: fileURL) 167 | let doc = try decoder.decode(VecturaDocument.self, from: data) 168 | 169 | // Rebuild normalized embeddings 170 | let norm = l2Norm(doc.embedding) 171 | var divisor = norm + 1e-9 172 | var normalized = [Float](repeating: 0, count: doc.embedding.count) 173 | vDSP_vsdiv(doc.embedding, 1, &divisor, &normalized, 1, vDSP_Length(doc.embedding.count)) 174 | normalizedEmbeddings[doc.id] = normalized 175 | documents[doc.id] = doc 176 | } catch { 177 | loadErrors.append( 178 | "Failed to load \(fileURL.lastPathComponent): \(error.localizedDescription)") 179 | } 180 | } 181 | 182 | if !loadErrors.isEmpty { 183 | throw 
VecturaError.loadFailed(loadErrors.joined(separator: "\n")) 184 | } 185 | } 186 | 187 | private func dotProduct(_ a: [Float], _ b: [Float]) -> Float { 188 | var result: Float = 0 189 | vDSP_dotpr(a, 1, b, 1, &result, vDSP_Length(a.count)) 190 | return result 191 | } 192 | 193 | private func l2Norm(_ v: [Float]) -> Float { 194 | var sumSquares: Float = 0 195 | vDSP_svesq(v, 1, &sumSquares, vDSP_Length(v.count)) 196 | return sqrt(sumSquares) 197 | } 198 | } 199 | -------------------------------------------------------------------------------- /Tests/VecturaKitTests/VecturaKitTests.swift: -------------------------------------------------------------------------------- 1 | import XCTest 2 | 3 | @testable import VecturaKit 4 | import Embeddings 5 | 6 | @available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, watchOS 11.0, *) 7 | final class VecturaKitTests: XCTestCase { 8 | var vectura: VecturaKit! 9 | var config: VecturaConfig! 10 | 11 | override func setUp() async throws { 12 | config = VecturaConfig(name: "test-db", dimension: 384) 13 | vectura = try VecturaKit(config: config) 14 | } 15 | 16 | override func tearDown() async throws { 17 | try await vectura.reset() 18 | vectura = nil 19 | } 20 | 21 | func testAddAndSearchDocument() async throws { 22 | let text = "This is a test document" 23 | let id = try await vectura.addDocument(text: text) 24 | 25 | let results = try await vectura.search(query: "test document") 26 | XCTAssertEqual(results.count, 1) 27 | XCTAssertEqual(results[0].id, id) 28 | XCTAssertEqual(results[0].text, text) 29 | } 30 | 31 | func testAddMultipleDocuments() async throws { 32 | let documents = [ 33 | "The quick brown fox jumps over the lazy dog", 34 | "Pack my box with five dozen liquor jugs", 35 | "How vexingly quick daft zebras jump", 36 | ] 37 | 38 | let ids = try await vectura.addDocuments(texts: documents) 39 | XCTAssertEqual(ids.count, 3) 40 | 41 | let results = try await vectura.search(query: "quick jumping animals") 42 | 
XCTAssertGreaterThanOrEqual(results.count, 2) 43 | XCTAssertTrue(results[0].score > results[1].score) 44 | } 45 | 46 | func testPersistence() async throws { 47 | // Add documents 48 | let texts = ["Document 1", "Document 2"] 49 | let ids = try await vectura.addDocuments(texts: texts) 50 | 51 | // Create new instance with same config 52 | let config = VecturaConfig(name: "test-db", dimension: 384) 53 | let newVectura = try VecturaKit(config: config) 54 | 55 | // Search should work with new instance 56 | let results = try await newVectura.search(query: "Document") 57 | XCTAssertEqual(results.count, 2) 58 | XCTAssertTrue(ids.contains(results[0].id)) 59 | XCTAssertTrue(ids.contains(results[1].id)) 60 | } 61 | 62 | func testSearchThreshold() async throws { 63 | let documents = [ 64 | "Very relevant document about cats", 65 | "Somewhat relevant about pets", 66 | "Completely irrelevant about weather", 67 | ] 68 | _ = try await vectura.addDocuments(texts: documents) 69 | 70 | // With high threshold, should get fewer results 71 | let results = try await vectura.search(query: "cats and pets", threshold: 0.8) 72 | XCTAssertLessThan(results.count, 3) 73 | } 74 | 75 | func testCustomIds() async throws { 76 | let customId = UUID() 77 | let text = "Document with custom ID" 78 | 79 | let resultId = try await vectura.addDocument(text: text, id: customId) 80 | XCTAssertEqual(customId, resultId) 81 | 82 | let results = try await vectura.search(query: text) 83 | XCTAssertEqual(results[0].id, customId) 84 | } 85 | 86 | func testModelReuse() async throws { 87 | // Multiple operations should reuse the same model 88 | let start = Date() 89 | for i in 1...5 { 90 | _ = try await vectura.addDocument(text: "Test document \(i)") 91 | } 92 | let duration = Date().timeIntervalSince(start) 93 | 94 | // If model is being reused, this should be relatively quick 95 | XCTAssertLessThan(duration, 5.0) // Adjust threshold as needed 96 | } 97 | 98 | func testEmptySearch() async throws { 99 | let results 
= try await vectura.search(query: "test query") 100 | XCTAssertEqual(results.count, 0, "Search on empty database should return no results") 101 | } 102 | 103 | func testDimensionMismatch() async throws { 104 | // Test with wrong dimension config 105 | let wrongConfig = VecturaConfig(name: "wrong-dim-db", dimension: 128) 106 | let wrongVectura = try VecturaKit(config: wrongConfig) 107 | 108 | let text = "Test document" 109 | 110 | do { 111 | _ = try await wrongVectura.addDocument(text: text) 112 | XCTFail("Expected dimension mismatch error") 113 | } catch let error as VecturaError { 114 | // Should throw dimension mismatch since BERT model outputs 384 dimensions 115 | switch error { 116 | case .dimensionMismatch(let expected, let got): 117 | XCTAssertEqual(expected, 128) 118 | XCTAssertEqual(got, 384) 119 | default: 120 | XCTFail("Wrong error type: \(error)") 121 | } 122 | } 123 | } 124 | 125 | func testDuplicateIds() async throws { 126 | let id = UUID() 127 | let text1 = "First document" 128 | let text2 = "Second document" 129 | 130 | // Add first document 131 | _ = try await vectura.addDocument(text: text1, id: id) 132 | 133 | // Adding second document with same ID should overwrite 134 | _ = try await vectura.addDocument(text: text2, id: id) 135 | 136 | let results = try await vectura.search(query: text2) 137 | XCTAssertEqual(results.count, 1) 138 | XCTAssertEqual(results[0].text, text2) 139 | } 140 | 141 | func testSearchThresholdEdgeCases() async throws { 142 | let documents = ["Test document"] 143 | _ = try await vectura.addDocuments(texts: documents) 144 | 145 | // Test with threshold = 1.0 (exact match only) 146 | let perfectResults = try await vectura.search(query: "Test document", threshold: 1.0) 147 | XCTAssertEqual(perfectResults.count, 0) // Should find no perfect matches due to encoding differences 148 | 149 | // Test with threshold = 0.0 (all matches) 150 | let allResults = try await vectura.search(query: "completely different", threshold: 0.0) 151 | 
XCTAssertEqual(allResults.count, 1) // Should return all documents 152 | } 153 | 154 | func testLargeNumberOfDocuments() async throws { 155 | let documentCount = 100 156 | var documents: [String] = [] 157 | 158 | for i in 0.. results[1].score) 210 | } 211 | 212 | func testCustomStorageDirectory() async throws { 213 | let customDirectoryURL = URL(filePath: NSTemporaryDirectory()).appending(path: "VecturaKitTest") 214 | defer { try? FileManager.default.removeItem(at: customDirectoryURL) } 215 | 216 | let instance = try VecturaKit(config: .init(name: "test", directoryURL: customDirectoryURL, dimension: 384)) 217 | let text = "Test document" 218 | let id = UUID() 219 | _ = try await instance.addDocument(text: text, id: id) 220 | 221 | let documentPath = customDirectoryURL.appending(path: "test/\(id).json").path(percentEncoded: false) 222 | XCTAssertTrue(FileManager.default.fileExists(atPath: documentPath), "Custom storage directory inserted document doesn't exist at \(documentPath)") 223 | } 224 | } 225 | -------------------------------------------------------------------------------- /Tests/VecturaMLXKitTests/VecturaMLXKitTests.swift: -------------------------------------------------------------------------------- 1 | import XCTest 2 | import Foundation 3 | @testable import VecturaMLXKit 4 | @testable import VecturaKit 5 | 6 | @available(macOS 14.0, iOS 17.0, tvOS 17.0, watchOS 10.0, *) 7 | final class VecturaMLXKitTests: XCTestCase { 8 | 9 | var testDirectory: URL! 10 | // Set a dimension matching your model expectation (e.g., 768) 11 | let testDimension = 768 12 | 13 | override func setUpWithError() throws { 14 | // Create a temporary directory for testing. 
15 | let temp = FileManager.default.temporaryDirectory 16 | testDirectory = temp.appendingPathComponent("VecturaMLXKitTests", isDirectory: true) 17 | if FileManager.default.fileExists(atPath: testDirectory.path) { 18 | try FileManager.default.removeItem(at: testDirectory) 19 | } 20 | try FileManager.default.createDirectory(at: testDirectory, withIntermediateDirectories: true) 21 | } 22 | 23 | override func tearDownWithError() throws { 24 | // Clean up the temporary directory. 25 | if FileManager.default.fileExists(atPath: testDirectory.path) { 26 | try FileManager.default.removeItem(at: testDirectory) 27 | } 28 | } 29 | 30 | func testAddAndSearch() async throws { 31 | // Create a test config with a minThreshold of 0 so any document is returned. 32 | let config = VecturaConfig( 33 | name: "TestDB", 34 | directoryURL: testDirectory, 35 | dimension: testDimension, 36 | searchOptions: VecturaConfig.SearchOptions(defaultNumResults: 10, minThreshold: 0, hybridWeight: 0.5, k1: 1.2, b: 0.75) 37 | ) 38 | let kit = try await VecturaMLXKit(config: config, modelConfiguration: .nomic_text_v1_5) 39 | 40 | let text = "Hello world" 41 | let ids = try await kit.addDocuments(texts: [text]) 42 | XCTAssertEqual(ids.count, 1, "Should add exactly one document.") 43 | 44 | // Perform a search using the same text. 
45 | let results = try await kit.search(query: text) 46 | XCTAssertEqual(results.count, 1, "The search should return one result after adding one document.") 47 | XCTAssertEqual(results.first?.text, text, "The text of the returned document should match the added text.") 48 | } 49 | 50 | func testDeleteDocuments() async throws { 51 | let config = VecturaConfig( 52 | name: "TestDB", 53 | directoryURL: testDirectory, 54 | dimension: testDimension, 55 | searchOptions: VecturaConfig.SearchOptions(defaultNumResults: 10, minThreshold: 0, hybridWeight: 0.5, k1: 1.2, b: 0.75) 56 | ) 57 | let kit = try await VecturaMLXKit(config: config, modelConfiguration: .nomic_text_v1_5) 58 | 59 | let text = "Delete me" 60 | let ids = try await kit.addDocuments(texts: [text]) 61 | XCTAssertEqual(ids.count, 1, "Should add exactly one document.") 62 | 63 | try await kit.deleteDocuments(ids: ids) 64 | 65 | let results = try await kit.search(query: text) 66 | XCTAssertTrue(results.isEmpty, "After deletion, the document should not be returned in search results.") 67 | } 68 | 69 | func testUpdateDocument() async throws { 70 | let config = VecturaConfig( 71 | name: "TestDB", 72 | directoryURL: testDirectory, 73 | dimension: testDimension, 74 | searchOptions: VecturaConfig.SearchOptions(defaultNumResults: 10, minThreshold: 0, hybridWeight: 0.5, k1: 1.2, b: 0.75) 75 | ) 76 | let kit = try await VecturaMLXKit(config: config, modelConfiguration: .nomic_text_v1_5) 77 | 78 | let originalText = "Original text" 79 | let updatedText = "Updated text" 80 | let ids = try await kit.addDocuments(texts: [originalText]) 81 | XCTAssertEqual(ids.count, 1, "Should add exactly one document.") 82 | 83 | let documentID = ids.first! 
84 | try await kit.updateDocument(id: documentID, newText: updatedText) 85 | 86 | let results = try await kit.search(query: updatedText) 87 | XCTAssertEqual(results.count, 1, "One document should be returned after update.") 88 | XCTAssertEqual(results.first?.text, updatedText, "The document text should be updated in the search results.") 89 | } 90 | 91 | func testReset() async throws { 92 | let config = VecturaConfig( 93 | name: "TestDB", 94 | directoryURL: testDirectory, 95 | dimension: testDimension, 96 | searchOptions: VecturaConfig.SearchOptions(defaultNumResults: 10, minThreshold: 0, hybridWeight: 0.5, k1: 1.2, b: 0.75) 97 | ) 98 | let kit = try await VecturaMLXKit(config: config, modelConfiguration: .nomic_text_v1_5) 99 | 100 | _ = try await kit.addDocuments(texts: ["Doc1", "Doc2"]) 101 | try await kit.reset() 102 | 103 | let results = try await kit.search(query: "Doc") 104 | XCTAssertTrue(results.isEmpty, "After a reset, search should return no results.") 105 | } 106 | 107 | // MARK: - Robust Search Tests 108 | 109 | func testSearchMultipleDocuments() async throws { 110 | let config = VecturaConfig( 111 | name: "TestMLXDB", 112 | directoryURL: testDirectory, 113 | dimension: testDimension, 114 | searchOptions: VecturaConfig.SearchOptions(defaultNumResults: 10, minThreshold: 0, hybridWeight: 0.5, k1: 1.2, b: 0.75) 115 | ) 116 | let kit = try await VecturaMLXKit(config: config, modelConfiguration: .nomic_text_v1_5) 117 | 118 | // Add several documents with overlapping keywords. 119 | let texts = [ 120 | "The quick brown fox jumps over the lazy dog", 121 | "A fast brown fox leaps over lazy hounds", 122 | "An agile brown fox", 123 | "Lazy dogs sleep all day", 124 | "Quick and nimble foxes" 125 | ] 126 | _ = try await kit.addDocuments(texts: texts) 127 | 128 | // Search for an expression close to "brown fox". 129 | let results = try await kit.search(query: "brown fox") 130 | 131 | // We expect at least two results related to 'brown fox'. 
132 | XCTAssertGreaterThanOrEqual(results.count, 2, "Should return at least two documents related to 'brown fox'.") 133 | 134 | // Verify that results are sorted in descending order by score. 135 | for i in 1..