├── .github
└── workflows
│ ├── build_and_test_mlx.yml
│ ├── build_and_test_vectura.yml
│ └── update-readme.yml
├── .gitignore
├── .swiftpm
└── xcode
│ ├── package.xcworkspace
│ └── contents.xcworkspacedata
│ └── xcshareddata
│ └── xcschemes
│ ├── VecturaKit-Package.xcscheme
│ ├── VecturaKit.xcscheme
│ ├── VecturaKitTests.xcscheme
│ ├── VecturaMLXKit.xcscheme
│ ├── VecturaMLXKitTests.xcscheme
│ ├── vectura-cli.xcscheme
│ └── vectura-mlx-cli.xcscheme
├── .vscode
├── launch.json
└── settings.json
├── LICENSE
├── Package.resolved
├── Package.swift
├── README.md
├── Sources
├── VecturaCLI
│ └── VecturaCLI.swift
├── VecturaKit
│ ├── BM25Index.swift
│ ├── FileStorageProvider.swift
│ ├── VecturaConfig.swift
│ ├── VecturaDocument.swift
│ ├── VecturaError.swift
│ ├── VecturaKit.swift
│ ├── VecturaModelSource.swift
│ ├── VecturaProtocol.swift
│ ├── VecturaSearchResult.swift
│ └── VecturaStorage.swift
├── VecturaMLXCLI
│ └── VecturaMLXCLI.swift
└── VecturaMLXKit
│ ├── MLXEmbedder.swift
│ └── VecturaMLXKit.swift
├── Tests
├── VecturaKitTests
│ └── VecturaKitTests.swift
└── VecturaMLXKitTests
│ └── VecturaMLXKitTests.swift
└── scripts
└── update_readme.py
/.github/workflows/build_and_test_mlx.yml:
--------------------------------------------------------------------------------
1 | name: "VecturaMLX CI"
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 |
9 | jobs:
10 | build-and-test:
11 | runs-on: macos-15
12 | env:
13 | DEVELOPER_DIR: "/Applications/Xcode_16.1.app/Contents/Developer"
14 | steps:
15 | - name: Checkout code
16 | uses: actions/checkout@v4
17 | - name: Setup Swift Toolchain
18 | uses: swift-actions/setup-swift@v2
19 | with:
20 | swift-version: "6.0"
21 | - name: Build target VecturaMLXKit
22 | run: xcodebuild -scheme "VecturaMLXKit" build -destination 'platform=macOS'
23 | - name: Run tests
24 | run: xcodebuild -scheme "VecturaMLXKitTests" test -destination 'platform=macOS'
--------------------------------------------------------------------------------
/.github/workflows/build_and_test_vectura.yml:
--------------------------------------------------------------------------------
1 | name: "VecturaKit CI"
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 |
9 | jobs:
10 | build-and-test:
11 | runs-on: macos-15
12 | env:
13 | DEVELOPER_DIR: "/Applications/Xcode_16.1.app/Contents/Developer"
14 | strategy:
15 | matrix:
16 | target: ["VecturaKit", "vectura-cli"]
17 | steps:
18 | - uses: actions/checkout@v4
19 | - name: Build ${{ matrix.target }}
20 | run: swift build --product "${{ matrix.target }}"
21 | - name: Test ${{ matrix.target }}
22 | run: swift test --filter "${{ matrix.target }}Tests"
--------------------------------------------------------------------------------
/.github/workflows/update-readme.yml:
--------------------------------------------------------------------------------
1 | name: Update README
2 |
3 | on:
4 | push:
5 | # Run on pushes to main (or any branch you prefer)
6 | branches: [main]
7 |
8 | permissions:
9 | contents: write
10 | pull-requests: write
11 |
12 | jobs:
13 | update-readme:
14 | # Skip this job if the commit message indicates a merge from docs/update-readme.
15 | # Adjust the string in the contains() check if your merge commit message is different.
16 | if: "!contains(github.event.head_commit.message, 'docs/update-readme')"
17 | runs-on: ubuntu-latest
18 |
19 | steps:
20 | # 1. Check out the repository
21 | - name: Checkout repository
22 | uses: actions/checkout@v4
23 |
24 | # 2. Set up Python (make sure you choose a suitable version)
25 | - name: Set up Python
26 | uses: actions/setup-python@v5
27 | with:
28 | python-version: '3.x'
29 |
30 | # 3. Install Gemini package
31 | - name: Install Gemini package
32 | run: pip install google-genai
33 |
34 | # 4. Run the Python script that calls Gemini AI
35 | - name: Run README updater script
36 | env:
37 | GEMINI_API_KEY: "${{ secrets.GEMINI_API_KEY }}"
38 | run: python3 scripts/update_readme.py
39 |
40 | # 5. Open a Pull Request using an action (this one automates creating a PR)
41 | - name: Create Pull Request
42 | uses: peter-evans/create-pull-request@v4
43 | with:
44 | token: ${{ secrets.GITHUB_TOKEN }}
45 | commit-message: "docs: update README.md based on codebase"
46 | title: "docs: update README.md"
47 | body: |
48 | This PR updates the README.md file based on the current codebase using Gemini AI.
49 |
50 | - Automatically generated by GitHub Actions
51 | branch: docs/update-readme
52 | base: main
53 | delete-branch: true
54 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Xcode
2 | #
3 | # gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore
4 |
5 | ## User settings
6 | xcuserdata/
7 |
8 | ## Obj-C/Swift specific
9 | *.hmap
10 |
11 | ## App packaging
12 | *.ipa
13 | *.dSYM.zip
14 | *.dSYM
15 |
16 | ## Playgrounds
17 | timeline.xctimeline
18 | playground.xcworkspace
19 |
20 | # Swift Package Manager
21 | #
22 | # Add this line if you want to avoid checking in source code from Swift Package Manager dependencies.
23 | # Packages/
24 | # Package.pins
25 | # Package.resolved
26 | # *.xcodeproj
27 | #
28 | # Xcode automatically generates this directory with a .xcworkspacedata file and xcuserdata
29 | # hence it is not needed unless you have added a package configuration file to your project
30 | # .swiftpm
31 |
32 | .build/
33 |
34 | # CocoaPods
35 | #
36 | # We recommend against adding the Pods directory to your .gitignore. However
37 | # you should judge for yourself, the pros and cons are mentioned at:
38 | # https://guides.cocoapods.org/using/using-cocoapods.html#should-i-check-the-pods-directory-into-source-control
39 | #
40 | # Pods/
41 | #
42 | # Add this line if you want to avoid checking in source code from the Xcode workspace
43 | # *.xcworkspace
44 |
45 | # Carthage
46 | #
47 | # Add this line if you want to avoid checking in source code from Carthage dependencies.
48 | # Carthage/Checkouts
49 |
50 | Carthage/Build/
51 |
52 | # fastlane
53 | #
54 | # It is recommended to not store the screenshots in the git repo.
55 | # Instead, use fastlane to re-generate the screenshots whenever they are needed.
56 | # For more information about the recommended setup visit:
57 | # https://docs.fastlane.tools/best-practices/source-control/#source-control
58 |
59 | fastlane/report.xml
60 | fastlane/Preview.html
61 | fastlane/screenshots/**/*.png
62 | fastlane/test_output
63 |
--------------------------------------------------------------------------------
/.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata:
--------------------------------------------------------------------------------
1 |
2 |
4 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.swiftpm/xcode/xcshareddata/xcschemes/VecturaKit-Package.xcscheme:
--------------------------------------------------------------------------------
1 |
2 |
5 |
9 |
10 |
16 |
22 |
23 |
24 |
30 |
36 |
37 |
38 |
44 |
50 |
51 |
52 |
58 |
64 |
65 |
66 |
67 |
68 |
74 |
75 |
77 |
83 |
84 |
85 |
87 |
93 |
94 |
95 |
96 |
97 |
107 |
108 |
114 |
115 |
116 |
117 |
123 |
124 |
130 |
131 |
132 |
133 |
135 |
136 |
139 |
140 |
141 |
--------------------------------------------------------------------------------
/.swiftpm/xcode/xcshareddata/xcschemes/VecturaKit.xcscheme:
--------------------------------------------------------------------------------
1 |
2 |
5 |
9 |
10 |
16 |
22 |
23 |
24 |
25 |
26 |
32 |
33 |
35 |
41 |
42 |
43 |
44 |
45 |
55 |
56 |
62 |
63 |
69 |
70 |
71 |
72 |
74 |
75 |
78 |
79 |
80 |
--------------------------------------------------------------------------------
/.swiftpm/xcode/xcshareddata/xcschemes/VecturaKitTests.xcscheme:
--------------------------------------------------------------------------------
1 |
2 |
5 |
9 |
10 |
16 |
17 |
19 |
25 |
26 |
27 |
28 |
29 |
39 |
40 |
46 |
47 |
49 |
50 |
53 |
54 |
55 |
--------------------------------------------------------------------------------
/.swiftpm/xcode/xcshareddata/xcschemes/VecturaMLXKit.xcscheme:
--------------------------------------------------------------------------------
1 |
2 |
5 |
9 |
10 |
16 |
22 |
23 |
24 |
25 |
26 |
32 |
33 |
43 |
44 |
50 |
51 |
57 |
58 |
59 |
60 |
62 |
63 |
66 |
67 |
68 |
--------------------------------------------------------------------------------
/.swiftpm/xcode/xcshareddata/xcschemes/VecturaMLXKitTests.xcscheme:
--------------------------------------------------------------------------------
1 |
2 |
5 |
9 |
10 |
16 |
17 |
19 |
25 |
26 |
27 |
28 |
29 |
39 |
40 |
46 |
47 |
49 |
50 |
53 |
54 |
55 |
--------------------------------------------------------------------------------
/.swiftpm/xcode/xcshareddata/xcschemes/vectura-cli.xcscheme:
--------------------------------------------------------------------------------
1 |
2 |
5 |
9 |
10 |
16 |
22 |
23 |
24 |
25 |
26 |
32 |
33 |
43 |
45 |
51 |
52 |
53 |
54 |
60 |
62 |
68 |
69 |
70 |
71 |
73 |
74 |
77 |
78 |
79 |
--------------------------------------------------------------------------------
/.swiftpm/xcode/xcshareddata/xcschemes/vectura-mlx-cli.xcscheme:
--------------------------------------------------------------------------------
1 |
2 |
5 |
9 |
10 |
16 |
22 |
23 |
24 |
25 |
26 |
32 |
33 |
43 |
45 |
51 |
52 |
53 |
54 |
57 |
58 |
59 |
60 |
66 |
68 |
74 |
75 |
76 |
77 |
79 |
80 |
83 |
84 |
85 |
--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
1 | {
2 | "configurations": [
3 | {
4 | "type": "swift",
5 | "request": "launch",
6 | "args": [],
7 | "cwd": "${workspaceFolder:VecturaKit}",
8 | "name": "Debug vectura-cli",
9 | "program": "${workspaceFolder:VecturaKit}/.build/debug/vectura-cli",
10 | "preLaunchTask": "swift: Build Debug vectura-cli"
11 | },
12 | {
13 | "type": "swift",
14 | "request": "launch",
15 | "args": [],
16 | "cwd": "${workspaceFolder:VecturaKit}",
17 | "name": "Release vectura-cli",
18 | "program": "${workspaceFolder:VecturaKit}/.build/release/vectura-cli",
19 | "preLaunchTask": "swift: Build Release vectura-cli"
20 | },
21 | {
22 | "type": "swift",
23 | "request": "launch",
24 | "args": [],
25 | "cwd": "${workspaceFolder:VecturaKit}",
26 | "name": "Debug vectura-mlx-cli",
27 | "program": "${workspaceFolder:VecturaKit}/.build/debug/vectura-mlx-cli",
28 | "preLaunchTask": "swift: Build Debug vectura-mlx-cli"
29 | },
30 | {
31 | "type": "swift",
32 | "request": "launch",
33 | "args": [],
34 | "cwd": "${workspaceFolder:VecturaKit}",
35 | "name": "Release vectura-mlx-cli",
36 | "program": "${workspaceFolder:VecturaKit}/.build/release/vectura-mlx-cli",
37 | "preLaunchTask": "swift: Build Release vectura-mlx-cli"
38 | }
39 | ]
40 | }
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {}
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 Rudrank Riyam
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Package.resolved:
--------------------------------------------------------------------------------
1 | {
2 | "originHash" : "6db3b74627697ac586e400fabd67629791185e08ed51fb68cf82ab175e2330c2",
3 | "pins" : [
4 | {
5 | "identity" : "gzipswift",
6 | "kind" : "remoteSourceControl",
7 | "location" : "https://github.com/1024jp/GzipSwift",
8 | "state" : {
9 | "revision" : "731037f6cc2be2ec01562f6597c1d0aa3fe6fd05",
10 | "version" : "6.0.1"
11 | }
12 | },
13 | {
14 | "identity" : "jinja",
15 | "kind" : "remoteSourceControl",
16 | "location" : "https://github.com/johnmai-dev/Jinja",
17 | "state" : {
18 | "revision" : "bbddb92fc51ae420b87300298370fd1dfc308f73",
19 | "version" : "1.1.1"
20 | }
21 | },
22 | {
23 | "identity" : "mlx-swift",
24 | "kind" : "remoteSourceControl",
25 | "location" : "https://github.com/ml-explore/mlx-swift",
26 | "state" : {
27 | "revision" : "70dbb62128a5a1471a5ab80363430adb33470cab",
28 | "version" : "0.21.2"
29 | }
30 | },
31 | {
32 | "identity" : "mlx-swift-examples",
33 | "kind" : "remoteSourceControl",
34 | "location" : "https://github.com/ml-explore/mlx-swift-examples/",
35 | "state" : {
36 | "branch" : "main",
37 | "revision" : "cb66b4bc6bc1a69663837881e7f1260cd49d6b59"
38 | }
39 | },
40 | {
41 | "identity" : "swift-argument-parser",
42 | "kind" : "remoteSourceControl",
43 | "location" : "https://github.com/apple/swift-argument-parser.git",
44 | "state" : {
45 | "revision" : "0fbc8848e389af3bb55c182bc19ca9d5dc2f255b",
46 | "version" : "1.4.0"
47 | }
48 | },
49 | {
50 | "identity" : "swift-collections",
51 | "kind" : "remoteSourceControl",
52 | "location" : "https://github.com/apple/swift-collections.git",
53 | "state" : {
54 | "revision" : "671108c96644956dddcd89dd59c203dcdb36cec7",
55 | "version" : "1.1.4"
56 | }
57 | },
58 | {
59 | "identity" : "swift-embeddings",
60 | "kind" : "remoteSourceControl",
61 | "location" : "https://github.com/jkrukowski/swift-embeddings.git",
62 | "state" : {
63 | "revision" : "419c52ea50238435218c587e3bebfe290ee91287",
64 | "version" : "0.0.13"
65 | }
66 | },
67 | {
68 | "identity" : "swift-numerics",
69 | "kind" : "remoteSourceControl",
70 | "location" : "https://github.com/apple/swift-numerics",
71 | "state" : {
72 | "revision" : "0a5bc04095a675662cf24757cc0640aa2204253b",
73 | "version" : "1.0.2"
74 | }
75 | },
76 | {
77 | "identity" : "swift-safetensors",
78 | "kind" : "remoteSourceControl",
79 | "location" : "https://github.com/jkrukowski/swift-safetensors.git",
80 | "state" : {
81 | "revision" : "718b0f38f912e0bf9d92130fa1e1fe2ae5136dd6",
82 | "version" : "0.0.7"
83 | }
84 | },
85 | {
86 | "identity" : "swift-sentencepiece",
87 | "kind" : "remoteSourceControl",
88 | "location" : "https://github.com/jkrukowski/swift-sentencepiece",
89 | "state" : {
90 | "revision" : "36a8b2b45733f6adb3092100f16e4c7d38a10a7c",
91 | "version" : "0.0.6"
92 | }
93 | },
94 | {
95 | "identity" : "swift-transformers",
96 | "kind" : "remoteSourceControl",
97 | "location" : "https://github.com/huggingface/swift-transformers",
98 | "state" : {
99 | "revision" : "be855fac725dbae27264e47a3eb535cc422a4ba8",
100 | "version" : "0.1.18"
101 | }
102 | }
103 | ],
104 | "version" : 3
105 | }
106 |
--------------------------------------------------------------------------------
/Package.swift:
--------------------------------------------------------------------------------
1 | // swift-tools-version: 6.0
2 | // The swift-tools-version declares the minimum version of Swift required to build this package.
3 |
4 | import PackageDescription
5 |
// Minimum OS versions declared for the package.
let supportedPlatforms: [SupportedPlatform] = [
  .macOS(.v14),
  .iOS(.v17),
  .tvOS(.v17),
  .visionOS(.v1),
  .watchOS(.v10),
]

// Libraries and executables vended by the package.
let vendedProducts: [Product] = [
  .library(name: "VecturaKit", targets: ["VecturaKit"]),
  .library(name: "VecturaMLXKit", targets: ["VecturaMLXKit"]),
  .executable(name: "vectura-cli", targets: ["VecturaCLI"]),
  .executable(name: "vectura-mlx-cli", targets: ["VecturaMLXCLI"]),
]

// External packages resolved by this manifest.
let externalPackages: [Package.Dependency] = [
  .package(url: "https://github.com/jkrukowski/swift-embeddings.git", from: "0.0.10"),
  .package(url: "https://github.com/apple/swift-argument-parser.git", from: "1.4.0"),
  .package(url: "https://github.com/ml-explore/mlx-swift-examples/", branch: "main"),
]

// Both command-line executables depend on the same ArgumentParser product.
let argumentParser: Target.Dependency = .product(
  name: "ArgumentParser",
  package: "swift-argument-parser"
)

// Library targets: the core database and its MLX-backed variant.
let libraryTargets: [Target] = [
  .target(
    name: "VecturaKit",
    dependencies: [
      .product(name: "Embeddings", package: "swift-embeddings")
    ],
    cSettings: [
      .define("ACCELERATE_NEW_LAPACK"),
      .define("ACCELERATE_LAPACK_ILP64"),
    ]
  ),
  .target(
    name: "VecturaMLXKit",
    dependencies: [
      "VecturaKit",
      .product(name: "MLXEmbedders", package: "mlx-swift-examples"),
    ]
  ),
]

// Executable targets backing the two CLI products.
let executableTargets: [Target] = [
  .executableTarget(
    name: "VecturaCLI",
    dependencies: ["VecturaKit", argumentParser]
  ),
  .executableTarget(
    name: "VecturaMLXCLI",
    dependencies: ["VecturaMLXKit", argumentParser]
  ),
]

// One test target per library.
let testTargets: [Target] = [
  .testTarget(name: "VecturaKitTests", dependencies: ["VecturaKit"]),
  .testTarget(name: "VecturaMLXKitTests", dependencies: ["VecturaMLXKit"]),
]

// Target order matches the original manifest: libraries, executables, then tests.
let package = Package(
  name: "VecturaKit",
  platforms: supportedPlatforms,
  products: vendedProducts,
  dependencies: externalPackages,
  targets: libraryTargets + executableTargets + testTargets
)
80 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # VecturaKit
2 |
3 | VecturaKit is a Swift-based vector database designed for on-device applications, enabling advanced user experiences through local vector storage and retrieval. Inspired by [Dripfarm's SVDB](https://github.com/Dripfarm/SVDB), **VecturaKit** utilizes `MLTensor` and [`swift-embeddings`](https://github.com/jkrukowski/swift-embeddings) for generating and managing embeddings. The framework offers two primary modules: `VecturaKit`, which supports diverse embedding models via `swift-embeddings`, and `VecturaMLXKit`, which leverages Apple's MLX framework for accelerated processing.
4 |
5 | ## Support
6 |
7 | Love this project? Check out my books to explore more of AI and iOS development:
8 | - [Exploring AI for iOS Development](https://academy.rudrank.com/product/ai)
9 | - [Exploring AI-Assisted Coding for iOS Development](https://academy.rudrank.com/product/ai-assisted-coding)
10 |
11 | Your support helps to keep this project growing!
12 |
13 | ## Key Features
14 |
15 | - **On-Device Storage:** Stores and manages vector embeddings locally, enhancing privacy and reducing latency.
16 | - **Hybrid Search:** Combines vector similarity with BM25 text search for comprehensive and relevant search results (`VecturaKit`).
17 | - **Batch Processing:** Indexes documents in parallel for faster data ingestion.
18 | - **Persistent Storage:** Automatically saves and loads document data, preserving the database state across app sessions.
19 | - **Configurable Search:** Customizes search behavior with adjustable thresholds, result limits, and hybrid search weights.
20 | - **Custom Storage Location:** Specifies a custom directory for database storage.
21 | - **MLX Support:** Employs Apple's MLX framework for accelerated embedding generation and search operations (`VecturaMLXKit`).
22 | - **CLI Tool:** Includes a command-line interface (CLI) for database management, testing, and debugging for both `VecturaKit` and `VecturaMLXKit`.
23 |
24 | ## Supported Platforms
25 |
26 | - macOS 14.0 or later
27 | - iOS 17.0 or later
28 | - tvOS 17.0 or later
29 | - visionOS 1.0 or later
30 | - watchOS 10.0 or later
31 |
32 | ## Installation
33 |
34 | ### Swift Package Manager
35 |
36 | To integrate VecturaKit into your project using Swift Package Manager, add the following dependency in your `Package.swift` file:
37 |
38 | ```swift
39 | dependencies: [
40 | .package(url: "https://github.com/rryam/VecturaKit.git", branch: "main"),
41 | ],
42 | ```
43 |
44 | ### Dependencies
45 |
46 | VecturaKit relies on the following Swift packages:
47 |
48 | - [swift-embeddings](https://github.com/jkrukowski/swift-embeddings): Used in `VecturaKit` for generating text embeddings using various models.
49 | - [swift-argument-parser](https://github.com/apple/swift-argument-parser): Used for creating the command-line interface.
50 | - [mlx-swift-examples](https://github.com/ml-explore/mlx-swift-examples): Provides MLX-based embeddings and vector search capabilities, specifically for `VecturaMLXKit`.
51 |
52 | ## Usage
53 |
54 | ### Core VecturaKit
55 |
56 | 1. **Import VecturaKit**
57 |
58 | ```swift
59 | import VecturaKit
60 | ```
61 |
62 | 2. **Create Configuration and Initialize Database**
63 |
64 | ```swift
65 | import Foundation
66 | import VecturaKit
67 |
68 | let config = VecturaConfig(
69 | name: "my-vector-db",
70 | directoryURL: nil, // Optional custom storage location
71 | dimension: 384, // Matches the default BERT model dimension
72 | searchOptions: VecturaConfig.SearchOptions(
73 | defaultNumResults: 10,
74 | minThreshold: 0.7,
75 | hybridWeight: 0.5, // Balance between vector and text search
76 | k1: 1.2, // BM25 parameters
77 | b: 0.75
78 | )
79 | )
80 |
81 | let vectorDB = try await VecturaKit(config: config)
82 | ```
83 |
84 | 3. **Add Documents**
85 |
86 | Single document:
87 |
88 | ```swift
89 | let text = "Sample text to be embedded"
90 | let documentId = try await vectorDB.addDocument(
91 | text: text,
92 | id: UUID(), // Optional, will be generated if not provided
93 | model: .id("sentence-transformers/all-MiniLM-L6-v2") // Optional, this is the default
94 | )
95 | ```
96 |
97 | Multiple documents in batch:
98 |
99 | ```swift
100 | let texts = [
101 | "First document text",
102 | "Second document text",
103 | "Third document text"
104 | ]
105 | let documentIds = try await vectorDB.addDocuments(
106 | texts: texts,
107 | ids: nil, // Optional array of UUIDs
108 | model: .id("sentence-transformers/all-MiniLM-L6-v2") // Optional model
109 | )
110 | ```
111 |
112 | 4. **Search Documents**
113 |
114 | Search by text (hybrid search):
115 |
116 | ```swift
117 | let results = try await vectorDB.search(
118 | query: "search query",
119 | numResults: 5, // Optional
120 | threshold: 0.8, // Optional
121 | model: .id("sentence-transformers/all-MiniLM-L6-v2") // Optional
122 | )
123 |
124 | for result in results {
125 | print("Document ID: \(result.id)")
126 | print("Text: \(result.text)")
127 | print("Similarity Score: \(result.score)")
128 | print("Created At: \(result.createdAt)")
129 | }
130 | ```
131 |
132 | Search by vector embedding:
133 |
134 | ```swift
135 | let results = try await vectorDB.search(
136 | query: embeddingArray, // [Float] matching config.dimension
137 | numResults: 5, // Optional
138 | threshold: 0.8 // Optional
139 | )
140 | ```
141 |
142 | 5. **Document Management**
143 |
144 | Update document:
145 |
146 | ```swift
147 | try await vectorDB.updateDocument(
148 | id: documentId,
149 | newText: "Updated text",
150 | model: .id("sentence-transformers/all-MiniLM-L6-v2") // Optional
151 | )
152 | ```
153 |
154 | Delete documents:
155 |
156 | ```swift
157 | try await vectorDB.deleteDocuments(ids: [documentId1, documentId2])
158 | ```
159 |
160 | Reset database:
161 |
162 | ```swift
163 | try await vectorDB.reset()
164 | ```
165 |
166 | ### VecturaMLXKit (MLX Version)
167 |
168 | VecturaMLXKit harnesses Apple's MLX framework for accelerated processing, delivering optimized performance for on-device machine learning tasks.
169 |
170 | 1. **Import VecturaMLXKit**
171 |
172 | ```swift
173 | import VecturaMLXKit
174 | ```
175 |
176 | 2. **Initialize Database**
177 |
178 | ```swift
179 | import VecturaMLXKit
180 | import MLXEmbedders
181 |
182 | let config = VecturaConfig(
183 | name: "my-mlx-vector-db",
184 | dimension: 768 // nomic_text_v1_5 model outputs 768-dimensional embeddings
185 | )
186 | let vectorDB = try await VecturaMLXKit(config: config, modelConfiguration: .nomic_text_v1_5)
187 | ```
188 |
189 | 3. **Add Documents**
190 |
191 | ```swift
192 | let texts = [
193 | "First document text",
194 | "Second document text",
195 | "Third document text"
196 | ]
197 | let documentIds = try await vectorDB.addDocuments(texts: texts)
198 | ```
199 |
200 | 4. **Search Documents**
201 |
202 | ```swift
203 | let results = try await vectorDB.search(
204 | query: "search query",
205 | numResults: 5, // Optional
206 | threshold: 0.8 // Optional
207 | )
208 |
209 | for result in results {
210 | print("Document ID: \(result.id)")
211 | print("Text: \(result.text)")
212 | print("Similarity Score: \(result.score)")
213 | print("Created At: \(result.createdAt)")
214 | }
215 | ```
216 |
217 | 5. **Document Management**
218 |
219 | Update document:
220 |
221 | ```swift
222 | try await vectorDB.updateDocument(
223 | id: documentId,
224 | newText: "Updated text"
225 | )
226 | ```
227 |
228 | Delete documents:
229 |
230 | ```swift
231 | try await vectorDB.deleteDocuments(ids: [documentId1, documentId2])
232 | ```
233 |
234 | Reset database:
235 |
236 | ```swift
237 | try await vectorDB.reset()
238 | ```
239 |
240 | ## Command Line Interface
241 |
242 | VecturaKit includes a command-line interface for both the standard and MLX versions, facilitating easy database management.
243 |
244 | **Standard CLI Tool**
245 |
246 | ```bash
247 | # Add documents
248 | vectura add "First document" "Second document" "Third document" \
249 | --db-name "my-vector-db" \
250 | --dimension 384 \
251 | --model-id "sentence-transformers/all-MiniLM-L6-v2"
252 |
253 | # Search documents
254 | vectura search "search query" \
255 | --db-name "my-vector-db" \
256 | --dimension 384 \
257 | --threshold 0.7 \
258 | --num-results 5 \
259 | --model-id "sentence-transformers/all-MiniLM-L6-v2"
260 |
261 | # Update document
262 | vectura update "Updated text content" \
263 | --db-name "my-vector-db" \
264 | --dimension 384 \
265 | --model-id "sentence-transformers/all-MiniLM-L6-v2"
266 |
267 | # Delete documents
268 | vectura delete \
269 | --db-name "my-vector-db" \
270 | --dimension 384
271 |
272 | # Reset database
273 | vectura reset \
274 | --db-name "my-vector-db" \
275 | --dimension 384
276 |
277 | # Run demo with sample data
278 | vectura mock \
279 | --db-name "my-vector-db" \
280 | --dimension 384 \
281 | --threshold 0.7 \
282 | --num-results 10 \
283 | --model-id "sentence-transformers/all-MiniLM-L6-v2"
284 | ```
285 |
286 | Common options:
287 |
288 | - `--db-name, -d`: Database name (default: "vectura-cli-db")
289 | - `--dimension, -v`: Vector dimension (default: 384)
290 | - `--threshold, -t`: Minimum similarity threshold (default: 0.7)
291 | - `--num-results, -n`: Number of results to return (default: 10)
292 | - `--model-id, -m`: Model ID for embeddings (default: "sentence-transformers/all-MiniLM-L6-v2")
293 |
294 | **MLX CLI Tool**
295 |
296 | ```bash
297 | # Add documents
298 | vectura-mlx add "First document" "Second document" "Third document" --db-name "my-mlx-vector-db"
299 |
300 | # Search documents
301 | vectura-mlx search "search query" --db-name "my-mlx-vector-db" --threshold 0.7 --num-results 5
302 |
303 | # Update document
304 | vectura-mlx update "Updated text content" --db-name "my-mlx-vector-db"
305 |
306 | # Delete documents
307 | vectura-mlx delete --db-name "my-mlx-vector-db"
308 |
309 | # Reset database
310 | vectura-mlx reset --db-name "my-mlx-vector-db"
311 |
312 | # Run demo with sample data
313 | vectura-mlx mock --db-name "my-mlx-vector-db"
314 | ```
315 |
316 | ## License
317 |
318 | VecturaKit is released under the MIT License. See the [LICENSE](LICENSE) file for more information. Copyright (c) 2025 Rudrank Riyam.
319 |
320 | ## Contributing
321 |
322 | Contributions are welcome! Please fork the repository and submit a pull request with your improvements.
323 |
324 | ### Development
325 |
326 | The project is structured as a Swift Package. It includes the following key targets:
327 |
328 | - `VecturaKit`: The core vector database library.
329 | - `VecturaMLXKit`: The MLX-accelerated version of the library.
330 | - `vectura-cli`: The command-line interface for `VecturaKit`.
331 | - `vectura-mlx-cli`: The command-line interface for `VecturaMLXKit`.
332 |
333 | To build and test the project, use the following commands:
334 |
335 | ```bash
336 | swift build
337 | swift test
338 | ```
339 |
340 | The project also includes CI workflows defined in `.github/workflows` to automate building and testing on pull requests and pushes to the `main` branch. The build-and-test workflows run on macOS 15 with Xcode 16.1 and Swift 6.0.
341 |
342 | Debugging configurations are provided in `.vscode/launch.json` for both `vectura-cli` and `vectura-mlx-cli`, each in debug and release variants. These can be used to launch either CLI with the debugger attached.
343 |
344 | ### Continuous Integration
345 |
346 | The project uses GitHub Actions for continuous integration. The following workflows are defined:
347 |
348 | - `.github/workflows/build_and_test_mlx.yml`: Builds and tests the `VecturaMLXKit` target.
349 | - `.github/workflows/build_and_test_vectura.yml`: Builds and tests the `VecturaKit` and `vectura-cli` targets.
350 | - `.github/workflows/update-readme.yml`: Automatically updates the `README.md` file using a Python script that calls the Gemini AI model. This workflow is triggered on pushes to the `main` branch and creates a pull request with the updated README.
351 |
--------------------------------------------------------------------------------
/Sources/VecturaCLI/VecturaCLI.swift:
--------------------------------------------------------------------------------
1 | import ArgumentParser
2 | import Foundation
3 | import VecturaKit
4 |
/// Root command of the `vectura` command-line tool.
///
/// Declares the subcommands and provides the shared database bootstrap helper
/// that they call.
@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, watchOS 11.0, *)
@main
struct VecturaCLI: AsyncParsableCommand {
  /// A `UUID` wrapper that can be parsed from a command-line argument string.
  struct DocumentID: ExpressibleByArgument, Decodable {
    let uuid: UUID

    /// Wraps an existing UUID.
    init(_ uuid: UUID) {
      self.uuid = uuid
    }

    /// Parses `argument` as a UUID string; yields `nil` when it is not a valid UUID.
    init?(argument: String) {
      guard let parsed = UUID(uuidString: argument) else { return nil }
      self.init(parsed)
    }
  }

  static let configuration = CommandConfiguration(
    commandName: "vectura",
    abstract: "A CLI tool for VecturaKit vector database",
    subcommands: [
      Add.self,
      Search.self,
      Update.self,
      Delete.self,
      Reset.self,
      Mock.self,
    ]
  )

  /// Builds a `VecturaKit` database from the options shared by the subcommands.
  ///
  /// - Parameters:
  ///   - dbName: Name passed to `VecturaConfig`.
  ///   - dimension: Vector dimension passed to `VecturaConfig`.
  ///   - numResults: Used as the search options' `defaultNumResults`.
  ///   - threshold: Used as the search options' `minThreshold`.
  /// - Returns: The initialized database.
  /// - Throws: Rethrows any error thrown by the `VecturaKit` initializer.
  static func setupDB(
    dbName: String,
    dimension: Int,
    numResults: Int,
    threshold: Float
  ) async throws -> VecturaKit {
    let searchOptions = VecturaConfig.SearchOptions(
      defaultNumResults: numResults,
      minThreshold: threshold
    )
    let databaseConfig = VecturaConfig(
      name: dbName,
      dimension: dimension,
      searchOptions: searchOptions
    )
    return try await VecturaKit(config: databaseConfig)
  }
}
41 |
42 | @available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, watchOS 11.0, *)
43 | extension VecturaCLI {
/// Runs an end-to-end demonstration: reset, add, search, update, delete.
struct Mock: AsyncParsableCommand {
  static let configuration = CommandConfiguration(
    abstract: "Run a mock demonstration with sample data"
  )

  @Option(name: [.long, .customShort("d")], help: "Database name")
  var dbName: String = "vectura-cli-db"

  @Option(name: [.long, .customShort("v")], help: "Vector dimension")
  var dimension: Int = 384

  @Option(name: [.long, .customShort("t")], help: "Minimum similarity threshold")
  var threshold: Float = 0.7

  @Option(name: [.long, .customShort("n")], help: "Number of results to return")
  var numResults: Int = 10

  @Option(name: [.long, .customShort("m")], help: "Model ID for embeddings")
  var modelId: String = "sentence-transformers/all-MiniLM-L6-v2"

  mutating func run() async throws {
    let database = try await VecturaCLI.setupDB(
      dbName: dbName,
      dimension: dimension,
      numResults: numResults,
      threshold: threshold
    )

    // Start from an empty database so the demo is repeatable.
    print("\n🧹 Resetting database...")
    try await database.reset()

    // Insert the sample corpus.
    print("\n📝 Adding sample documents...")
    let sampleTexts = [
      "The quick brown fox jumps over the lazy dog",
      "To be or not to be, that is the question",
      "All that glitters is not gold",
      "A journey of a thousand miles begins with a single step",
      "Where there's smoke, there's fire",
    ]
    let insertedIDs = try await database.addDocuments(texts: sampleTexts, modelId: modelId)
    Self.printAdded(ids: insertedIDs, texts: sampleTexts)

    // Query the corpus.
    print("\n🔍 Searching for 'journey'...")
    let matches = try await database.search(
      query: "journey",
      numResults: numResults,
      threshold: threshold,
      modelId: modelId
    )
    Self.printMatches(matches)

    // Rewrite the first inserted document.
    if let firstId = insertedIDs.first {
      print("\n✏️ Updating first document...")
      let newText = "The quick red fox jumps over the sleeping dog"
      try await database.updateDocument(id: firstId, newText: newText, modelId: modelId)
      print("Updated document \(firstId) with new text: \(newText)")
    }

    // Remove the last inserted document.
    if let lastId = insertedIDs.last {
      print("\n🗑️ Deleting last document...")
      try await database.deleteDocuments(ids: [lastId])
      print("Deleted document \(lastId)")
    }

    print("\n✨ Mock demonstration completed!")
  }

  /// Prints each newly added document ID next to its text.
  private static func printAdded(ids: [UUID], texts: [String]) {
    print("Added \(ids.count) documents:")
    for (id, text) in zip(ids, texts) {
      print("ID: \(id)")
      print("Text: \(text)")
      print("---")
    }
  }

  /// Prints every search hit with its score and creation date.
  private static func printMatches(_ matches: [VecturaSearchResult]) {
    print("Found \(matches.count) results:")
    for match in matches {
      print("ID: \(match.id)")
      print("Text: \(match.text)")
      print("Score: \(match.score)")
      print("Created: \(match.createdAt)")
      print("---")
    }
  }
}
130 |
/// Embeds the given texts and stores them as new documents.
struct Add: AsyncParsableCommand {
  static let configuration = CommandConfiguration(
    abstract: "Add documents to the vector database"
  )

  @Option(name: [.long, .customShort("d")], help: "Database name")
  var dbName: String = "vectura-cli-db"

  @Option(name: [.long, .customShort("v")], help: "Vector dimension")
  var dimension: Int = 384

  @Option(name: [.long, .customShort("m")], help: "Model ID for embeddings")
  var modelId: String = "sentence-transformers/all-MiniLM-L6-v2"

  @Argument(help: "Text content to add")
  var text: [String]

  mutating func run() async throws {
    // Search defaults are irrelevant for inserts; fixed values are passed through.
    let database = try await VecturaCLI.setupDB(
      dbName: dbName,
      dimension: dimension,
      numResults: 10,
      threshold: 0.7
    )

    let insertedIDs = try await database.addDocuments(texts: text, modelId: modelId)
    print("Added \(insertedIDs.count) documents:")

    for (insertedID, content) in zip(insertedIDs, text) {
      print("ID: \(insertedID)")
      print("Text: \(content)")
      print("---")
    }
  }
}
164 |
/// Runs a text query against the database and prints the matching documents.
struct Search: AsyncParsableCommand {
  static let configuration = CommandConfiguration(
    abstract: "Search documents in the vector database"
  )

  @Option(name: [.long, .customShort("d")], help: "Database name")
  var dbName: String = "vectura-cli-db"

  @Option(name: [.long, .customShort("v")], help: "Vector dimension")
  var dimension: Int = 384

  @Option(name: [.long, .customShort("t")], help: "Minimum similarity threshold")
  var threshold: Float = 0.7

  @Option(name: [.long, .customShort("n")], help: "Number of results to return")
  var numResults: Int = 10

  @Option(name: [.long, .customShort("m")], help: "Model ID for embeddings")
  var modelId: String = "sentence-transformers/all-MiniLM-L6-v2"

  @Argument(help: "Search query")
  var query: String

  mutating func run() async throws {
    let database = try await VecturaCLI.setupDB(
      dbName: dbName,
      dimension: dimension,
      numResults: numResults,
      threshold: threshold
    )

    let matches = try await database.search(
      query: query,
      numResults: numResults,
      threshold: threshold,
      modelId: modelId
    )

    print("Found \(matches.count) results:")
    for match in matches {
      print("ID: \(match.id)")
      print("Text: \(match.text)")
      print("Score: \(match.score)")
      print("Created: \(match.createdAt)")
      print("---")
    }
  }
}
212 |
/// Replaces the text of an existing document, identified by its UUID.
struct Update: AsyncParsableCommand, Decodable {
  static let configuration = CommandConfiguration(
    abstract: "Update a document in the vector database"
  )

  @Option(name: [.long, .customShort("d")], help: "Database name")
  var dbName: String = "vectura-cli-db"

  @Option(name: [.long, .customShort("v")], help: "Vector dimension")
  var dimension: Int = 384

  @Option(name: [.long, .customShort("m")], help: "Model ID for embeddings")
  var modelId: String = "sentence-transformers/all-MiniLM-L6-v2"

  @Argument(help: "Document ID to update")
  var id: DocumentID

  @Argument(help: "New text content")
  var newText: String

  mutating func run() async throws {
    // Search defaults are irrelevant for updates; fixed values are passed through.
    let database = try await VecturaCLI.setupDB(
      dbName: dbName,
      dimension: dimension,
      numResults: 10,
      threshold: 0.7
    )

    let targetID = id.uuid
    try await database.updateDocument(id: targetID, newText: newText, modelId: modelId)
    print("Updated document \(targetID) with new text: \(newText)")
  }
}
244 |
/// Deletes the documents whose UUIDs are given on the command line.
struct Delete: AsyncParsableCommand, Decodable {
  static let configuration = CommandConfiguration(
    abstract: "Delete documents from the vector database"
  )

  @Option(name: [.long, .customShort("d")], help: "Database name")
  var dbName: String = "vectura-cli-db"

  @Option(name: [.long, .customShort("v")], help: "Vector dimension")
  var dimension: Int = 384

  @Argument(help: "Document IDs to delete")
  var ids: [DocumentID]

  mutating func run() async throws {
    // Search defaults are irrelevant for deletes; fixed values are passed through.
    let database = try await VecturaCLI.setupDB(
      dbName: dbName,
      dimension: dimension,
      numResults: 10,
      threshold: 0.7
    )

    let targetIDs = ids.map { $0.uuid }
    try await database.deleteDocuments(ids: targetIDs)
    print("Deleted \(ids.count) documents")
  }
}
270 |
/// Resets the named database via `VecturaKit.reset()`.
struct Reset: AsyncParsableCommand {
  static let configuration = CommandConfiguration(
    abstract: "Reset the vector database"
  )

  @Option(name: [.long, .customShort("d")], help: "Database name")
  var dbName: String = "vectura-cli-db"

  @Option(name: [.long, .customShort("v")], help: "Vector dimension")
  var dimension: Int = 384

  mutating func run() async throws {
    // Search defaults are irrelevant for a reset; fixed values are passed through.
    let database = try await VecturaCLI.setupDB(
      dbName: dbName,
      dimension: dimension,
      numResults: 10,
      threshold: 0.7
    )

    try await database.reset()
    print("Database reset successfully")
  }
}
293 | }
294 |
--------------------------------------------------------------------------------
/Sources/VecturaKit/BM25Index.swift:
--------------------------------------------------------------------------------
1 | //
2 | // BM25Index.swift
3 | // VecturaKit
4 | //
5 | // Created by Rudrank Riyam on 1/19/25.
6 | //
7 |
8 | import Foundation
9 |
/// Splits `text` into lowercase, diacritic-folded tokens.
///
/// Every run of non-alphanumeric characters acts as a separator; empty
/// fragments between separators are discarded.
private func tokenize(_ text: String) -> [String] {
  let separators = CharacterSet.alphanumerics.inverted
  let folded = text.lowercased().folding(options: .diacriticInsensitive, locale: .current)
  let fragments = folded.components(separatedBy: separators)
  return fragments.filter { fragment in fragment.isEmpty == false }
}
16 |
/// An index for BM25-based text search over VecturaDocuments.
///
/// The index keeps, for the indexed corpus, the number of documents each term
/// appears in, the token length of every document, and the average document
/// length. Scores are computed at query time from those statistics.
public struct BM25Index {
  private let k1: Float
  private let b: Float
  private var documents: [VecturaDocument]
  private var documentFrequencies: [String: Int]
  private var documentLengths: [UUID: Int]
  private var averageDocumentLength: Float

  /// Creates a new BM25 index for the given documents
  ///
  /// - Parameters:
  ///   - documents: The documents to index
  ///   - k1: BM25 k1 parameter (default: 1.2)
  ///   - b: BM25 b parameter (default: 0.75)
  public init(documents: [VecturaDocument], k1: Float = 1.2, b: Float = 0.75) {
    self.k1 = k1
    self.b = b
    self.documents = documents
    self.documentFrequencies = [:]
    self.documentLengths = [:]
    self.averageDocumentLength = 0

    for document in documents {
      // Tokenize once and reuse the tokens for both length and document frequency.
      let tokens = tokenize(document.text)
      documentLengths[document.id] = tokens.count
      for term in Set(tokens) {
        documentFrequencies[term, default: 0] += 1
      }
    }

    refreshAverageDocumentLength()
  }

  /// Searches the index using BM25 scoring
  ///
  /// - Parameters:
  ///   - query: The search query
  ///   - topK: Maximum number of results to return; values `<= 0` yield no results
  /// - Returns: Array of tuples containing documents and their BM25 scores,
  ///   sorted by descending score. Documents with a score of zero are omitted.
  public func search(query: String, topK: Int = 10) -> [(document: VecturaDocument, score: Float)] {
    let queryTerms = tokenize(query)
    guard topK > 0, !queryTerms.isEmpty, !documents.isEmpty else { return [] }

    // IDF depends only on the term, so compute it once per distinct query term.
    // The `1 +` inside the logarithm keeps the IDF non-negative; without it a
    // term present in more than half of the documents gets a negative weight
    // and its matches are dropped by the positive-score filter below.
    let documentCount = Float(documents.count)
    var idfByTerm: [String: Float] = [:]
    for term in Set(queryTerms) {
      let df = Float(documentFrequencies[term] ?? 0)
      idfByTerm[term] = log(1 + (documentCount - df + 0.5) / (df + 0.5))
    }

    var scored: [(document: VecturaDocument, score: Float)] = []
    scored.reserveCapacity(documents.count)

    for document in documents {
      // Count term occurrences once per document instead of once per query term.
      var termCounts: [String: Int] = [:]
      for token in tokenize(document.text) {
        termCounts[token, default: 0] += 1
      }

      let docLength = Float(documentLengths[document.id] ?? 0)
      // Guard the ratio: an average length of zero would otherwise produce NaN.
      let lengthRatio = averageDocumentLength > 0 ? docLength / averageDocumentLength : 0
      let lengthPenalty = k1 * (1 - b + b * lengthRatio)

      var score: Float = 0.0
      for term in queryTerms {
        let tf = Float(termCounts[term] ?? 0)
        // Terms absent from the document contribute nothing.
        guard tf > 0, let idf = idfByTerm[term] else { continue }
        let numerator = tf * (k1 + 1)
        let denominator = tf + lengthPenalty
        score += idf * (numerator / denominator)
      }

      if score > 0 {
        scored.append((document: document, score: score))
      }
    }

    return Array(scored.sorted { $0.score > $1.score }.prefix(topK))
  }

  /// Add a new document to the index
  ///
  /// - Parameter document: The document to add
  public mutating func addDocument(_ document: VecturaDocument) {
    documents.append(document)

    let tokens = tokenize(document.text)
    documentLengths[document.id] = tokens.count

    for term in Set(tokens) {
      documentFrequencies[term, default: 0] += 1
    }

    refreshAverageDocumentLength()
  }

  /// Recomputes the average token length; an empty index has an average of 0.
  private mutating func refreshAverageDocumentLength() {
    guard !documents.isEmpty else {
      averageDocumentLength = 0
      return
    }
    let totalLength = documentLengths.values.reduce(0, +)
    averageDocumentLength = Float(totalLength) / Float(documents.count)
  }
}
111 |
extension VecturaDocument {
  /// Blends a vector-similarity score with a BM25 score into one ranking value.
  ///
  /// The BM25 score is divided by 10 and clamped to `0...1` before blending.
  ///
  /// - Parameters:
  ///   - vectorScore: The vector similarity score
  ///   - bm25Score: The raw BM25 score
  ///   - weight: Weight given to the vector score (0.0-1.0); BM25 receives `1 - weight`
  /// - Returns: `weight * vectorScore + (1 - weight) * normalizedBM25`
  public func hybridScore(vectorScore: Float, bm25Score: Float, weight: Float = 0.5) -> Float {
    let scaledBM25 = bm25Score / 10.0
    let flooredBM25 = max(scaledBM25, 0)
    let normalizedBM25 = min(flooredBM25, 1)

    let vectorPart = weight * vectorScore
    let keywordPart = (1 - weight) * normalizedBM25
    return vectorPart + keywordPart
  }
}
125 |
--------------------------------------------------------------------------------
/Sources/VecturaKit/FileStorageProvider.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 | import Accelerate
3 |
/// A file‑based storage provider that implements VecturaStorage using JSON files.
/// This provider maintains an in‑memory cache of documents while persisting them
/// to a specified storage directory, one `<uuid>.json` file per document.
public class FileStorageProvider: VecturaStorage {
  /// The storage directory where JSON files are stored.
  private let storageDirectory: URL

  /// In‑memory cache of documents keyed by their UUID.
  private var documents: [UUID: VecturaDocument] = [:]

  /// In‑memory cache of normalized embeddings for each document.
  private var normalizedEmbeddings: [UUID: [Float]] = [:]

  /// Initializes the provider with the target storage directory.
  ///
  /// - Parameter storageDirectory: The directory URL where documents will be saved and loaded.
  /// - Throws: If the directory cannot be created or its contents cannot be listed.
  public init(storageDirectory: URL) throws {
    self.storageDirectory = storageDirectory

    // Ensure the storage directory exists
    try FileStorageProvider.ensureDirectoryExists(at: storageDirectory)

    // Load any existing documents.
    try loadDocumentsFromStorage()
  }

  /// Ensures that the storage directory exists.
  public func createStorageDirectoryIfNeeded() async throws {
    try FileStorageProvider.ensureDirectoryExists(at: storageDirectory)
  }

  /// Loads documents from in‑memory cache.
  /// This function returns the documents currently held in the cache.
  public func loadDocuments() async throws -> [VecturaDocument] {
    return Array(documents.values)
  }

  /// Saves a document by encoding it to JSON and writing it to disk.
  /// It also updates the in‑memory caches for the document and its normalized embedding.
  ///
  /// - Throws: If encoding or writing fails; in that case the caches are left untouched.
  public func saveDocument(_ document: VecturaDocument) async throws {
    // Encode and write document to disk
    let encoder = JSONEncoder()
    encoder.outputFormatting = .prettyPrinted
    let data = try encoder.encode(document)

    // Write atomically so an interrupted write cannot leave a truncated JSON file.
    try data.write(to: fileURL(forDocumentID: document.id), options: .atomic)

    // Update the caches only after the write succeeded, so memory never
    // contains a document that is missing from disk.
    documents[document.id] = document
    normalizedEmbeddings[document.id] = normalized(document.embedding)
  }

  /// Deletes a document by deleting its file and removing it from the in‑memory caches.
  ///
  /// - Throws: If the file cannot be removed; in that case the caches are left untouched.
  public func deleteDocument(withID id: UUID) async throws {
    // Remove the file first: if this throws, the cached entry still matches what is on disk.
    try FileManager.default.removeItem(at: fileURL(forDocumentID: id))

    documents.removeValue(forKey: id)
    normalizedEmbeddings.removeValue(forKey: id)
  }

  /// Updates an existing document.
  /// This is implemented by saving the updated document, which overwrites the existing file.
  public func updateDocument(_ document: VecturaDocument) async throws {
    try await saveDocument(document)
  }

  // MARK: - Private Helper Methods

  /// Creates `directory` (including intermediate directories) when nothing exists at its path.
  private static func ensureDirectoryExists(at directory: URL) throws {
    if !FileManager.default.fileExists(atPath: directory.path) {
      try FileManager.default.createDirectory(at: directory, withIntermediateDirectories: true)
    }
  }

  /// Returns the on-disk location of the JSON file for the given document ID.
  private func fileURL(forDocumentID id: UUID) -> URL {
    storageDirectory.appendingPathComponent("\(id).json")
  }

  /// Loads all JSON‑encoded documents from disk into memory.
  private func loadDocumentsFromStorage() throws {
    let fileURLs = try FileManager.default.contentsOfDirectory(at: storageDirectory, includingPropertiesForKeys: nil)
    let decoder = JSONDecoder()

    for fileURL in fileURLs where fileURL.pathExtension.lowercased() == "json" {
      do {
        let data = try Data(contentsOf: fileURL)
        let doc = try decoder.decode(VecturaDocument.self, from: data)
        documents[doc.id] = doc
        normalizedEmbeddings[doc.id] = normalized(doc.embedding)
      } catch {
        // Best effort: one unreadable file must not prevent the others from loading.
        print("Failed to load \(fileURL.lastPathComponent): \(error.localizedDescription)")
      }
    }
  }

  /// Returns `embedding` divided by its L2 norm.
  /// A small epsilon is added to the norm so a zero vector does not divide by zero.
  private func normalized(_ embedding: [Float]) -> [Float] {
    var divisor = l2Norm(embedding) + 1e-9
    var result = [Float](repeating: 0, count: embedding.count)
    vDSP_vsdiv(embedding, 1, &divisor, &result, 1, vDSP_Length(embedding.count))
    return result
  }

  /// Computes the L2 norm of a vector.
  private func l2Norm(_ vector: [Float]) -> Float {
    var sumSquares: Float = 0
    vDSP_svesq(vector, 1, &sumSquares, vDSP_Length(vector.count))
    return sqrt(sumSquares)
  }
}
115 |
--------------------------------------------------------------------------------
/Sources/VecturaKit/VecturaConfig.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 |
/// Settings that describe a single Vectura vector database instance.
public struct VecturaConfig {

  /// Tunable parameters for similarity and hybrid search.
  public struct SearchOptions {
    /// How many results a search returns when the caller does not say.
    public var defaultNumResults: Int

    /// Lowest similarity a result may have; `nil` disables the cut-off.
    public var minThreshold: Float?

    /// Share of the hybrid score taken from vector similarity (0.0-1.0).
    /// BM25 contributes the remaining `1 - hybridWeight`.
    public var hybridWeight: Float

    /// BM25 `k1` parameter.
    public var k1: Float

    /// BM25 `b` parameter.
    public var b: Float

    public init(
      defaultNumResults: Int = 10,
      minThreshold: Float? = nil,
      hybridWeight: Float = 0.5,
      k1: Float = 1.2,
      b: Float = 0.75
    ) {
      self.defaultNumResults = defaultNumResults
      self.minThreshold = minThreshold
      self.hybridWeight = hybridWeight
      self.k1 = k1
      self.b = b
    }
  }

  /// The name of the database instance.
  public let name: String

  /// Optional custom parent directory for the database.
  /// It is created when missing, and the contents live in a subdirectory named after ``name``.
  public let directoryURL: URL?

  /// The dimension of the vectors stored in this database.
  public let dimension: Int

  /// Search configuration options.
  public var searchOptions: SearchOptions

  public init(
    name: String,
    directoryURL: URL? = nil,
    dimension: Int,
    searchOptions: SearchOptions = SearchOptions()
  ) {
    self.name = name
    self.directoryURL = directoryURL
    self.dimension = dimension
    self.searchOptions = searchOptions
  }
}
62 |
--------------------------------------------------------------------------------
/Sources/VecturaKit/VecturaDocument.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 |
/// One entry of the vector database: a text together with its embedding.
public struct VecturaDocument: Identifiable, Codable, Sendable {
  /// Unique identifier of the document.
  public let id: UUID

  /// Text content of the document.
  public let text: String

  /// Vector embedding of the document.
  public let embedding: [Float]

  /// Moment at which this value was created by `init(id:text:embedding:)`.
  public let createdAt: Date

  /// Creates a document stamped with the current date.
  /// - Parameters:
  ///   - id: Identifier to use; a fresh UUID is generated when `nil`.
  ///   - text: The text content of the document.
  ///   - embedding: The vector embedding of the document.
  public init(id: UUID? = nil, text: String, embedding: [Float]) {
    if let id {
      self.id = id
    } else {
      self.id = UUID()
    }
    self.text = text
    self.embedding = embedding
    self.createdAt = Date()
  }

  // MARK: - Codable
  enum CodingKeys: String, CodingKey {
    case id
    case text
    case embedding
    case createdAt
  }
}
34 |
--------------------------------------------------------------------------------
/Sources/VecturaKit/VecturaError.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 |
/// The failures VecturaKit reports to its callers.
public enum VecturaError: LocalizedError {
  /// A collection with this name already exists.
  case collectionAlreadyExists(String)

  /// No collection with this name exists.
  case collectionNotFound(String)

  /// A vector had a different dimension than the one expected.
  case dimensionMismatch(expected: Int, got: Int)

  /// Collection data could not be loaded; the payload carries the reason.
  case loadFailed(String)

  /// Input validation failed; the payload carries the reason.
  case invalidInput(String)

  /// A human-readable message for each case.
  public var errorDescription: String? {
    let message: String
    switch self {
    case let .collectionAlreadyExists(name):
      message = "A collection named '\(name)' already exists."
    case let .collectionNotFound(name):
      message = "Collection '\(name)' not found."
    case let .dimensionMismatch(expected, got):
      message = "Vector dimension mismatch. Expected \(expected) but got \(got)."
    case let .loadFailed(reason):
      message = "Failed to load collection: \(reason)"
    case let .invalidInput(reason):
      message = "Invalid input: \(reason)"
    }
    return message
  }
}
35 |
--------------------------------------------------------------------------------
/Sources/VecturaKit/VecturaKit.swift:
--------------------------------------------------------------------------------
1 | import Accelerate
2 | import CoreML
3 | import Embeddings
4 | import Foundation
5 |
6 | @available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, watchOS 11.0, *)
7 | /// A vector database implementation that stores and searches documents using their vector embeddings.
8 | public class VecturaKit: VecturaProtocol {
9 |
10 | /// The configuration for this vector database instance.
11 | private let config: VecturaConfig
12 |
13 | /// In-memory cache of all documents.
14 | private var documents: [UUID: VecturaDocument]
15 |
16 | /// The storage directory for documents.
17 | private let storageDirectory: URL
18 |
19 | /// The storage provider that handles document persistence.
20 | private let storageProvider: VecturaStorage
21 |
22 | /// Cached normalized embeddings for faster searches.
23 | private var normalizedEmbeddings: [UUID: [Float]] = [:]
24 |
25 | /// BM25 index for text search
26 | private var bm25Index: BM25Index?
27 |
28 | /// Swift-Embeddings model bundle that you can reuse (e.g. BERT, XLM-R, CLIP, etc.)
29 | private var bertModel: Bert.ModelBundle?
30 |
31 | // MARK: - Initialization
32 |
/// Opens (creating if necessary) the database described by `config` and loads
/// its stored documents into memory.
///
/// The database lives in `<config.directoryURL>/<config.name>` when a custom
/// directory is configured, otherwise in `VecturaKit/<config.name>` inside the
/// user's documents directory.
///
/// - Parameter config: The configuration for this database instance.
/// - Throws: `VecturaError.loadFailed` when no documents directory can be
///   located, or any error raised while creating the directory or loading documents.
public init(config: VecturaConfig) async throws {
  self.config = config
  self.documents = [:]

  let directory: URL
  if let customStorageDirectory = config.directoryURL {
    directory = customStorageDirectory.appending(path: config.name)
  } else {
    // Fail with a regular error instead of force-unwrapping an empty URL list.
    guard
      let documentsDirectory = FileManager.default
        .urls(for: .documentDirectory, in: .userDomainMask)
        .first
    else {
      throw VecturaError.loadFailed("Could not locate the user's documents directory")
    }
    directory = documentsDirectory
      .appendingPathComponent("VecturaKit")
      .appendingPathComponent(config.name)
  }
  self.storageDirectory = directory

  // With intermediate directories enabled this succeeds when the directory
  // already exists, so no separate existence check is needed.
  try FileManager.default.createDirectory(at: directory, withIntermediateDirectories: true)

  // Instantiate the storage provider (currently the file-based implementation).
  self.storageProvider = try FileStorageProvider(storageDirectory: directory)

  // Load existing documents using the storage provider.
  let storedDocuments = try await storageProvider.loadDocuments()
  for doc in storedDocuments {
    self.documents[doc.id] = doc

    // Cache the unit-length embedding; the epsilon avoids dividing by zero.
    var divisor = l2Norm(doc.embedding) + 1e-9
    var normalized = [Float](repeating: 0, count: doc.embedding.count)
    vDSP_vsdiv(doc.embedding, 1, &divisor, &normalized, 1, vDSP_Length(doc.embedding.count))
    self.normalizedEmbeddings[doc.id] = normalized
  }
}
69 |
70 | /// Adds multiple documents to the vector store in batch.
71 | public func addDocuments(
72 | texts: [String],
73 | ids: [UUID]? = nil,
74 | model: VecturaModelSource = .default
75 | ) async throws -> [UUID] {
76 | if let ids = ids, ids.count != texts.count {
77 | throw VecturaError.invalidInput("Number of IDs must match number of texts")
78 | }
79 |
80 | if bertModel == nil {
81 | bertModel = try await Bert.loadModelBundle(from: model)
82 | }
83 |
84 | guard let modelBundle = bertModel else {
85 | throw VecturaError.invalidInput("Failed to load BERT model: \(model)")
86 | }
87 |
88 | let embeddingsTensor = try modelBundle.batchEncode(texts)
89 | let shape = embeddingsTensor.shape
90 |
91 | if shape.count != 2 {
92 | throw VecturaError.invalidInput("Expected shape [N, D], got \(shape)")
93 | }
94 |
95 | if shape[1] != config.dimension {
96 | throw VecturaError.dimensionMismatch(
97 | expected: config.dimension,
98 | got: shape[1]
99 | )
100 | }
101 |
102 | let embeddingShapedArray = await embeddingsTensor.cast(to: Float.self).shapedArray(
103 | of: Float.self)
104 | let allScalars = embeddingShapedArray.scalars
105 |
106 | var documentIds = [UUID]()
107 | var documentsToSave = [VecturaDocument]()
108 |
109 | for i in 0.. [VecturaSearchResult] {
166 | if queryEmbedding.count != config.dimension {
167 | throw VecturaError.dimensionMismatch(
168 | expected: config.dimension,
169 | got: queryEmbedding.count
170 | )
171 | }
172 |
173 | // Normalize the query vector
174 | let norm = l2Norm(queryEmbedding)
175 | var divisor = norm + 1e-9
176 | var normalizedQuery = [Float](repeating: 0, count: queryEmbedding.count)
177 | vDSP_vsdiv(queryEmbedding, 1, &divisor, &normalizedQuery, 1, vDSP_Length(queryEmbedding.count))
178 |
179 | // Build a matrix of normalized document embeddings in row-major order
180 | var docIds = [UUID]()
181 | var matrix = [Float]()
182 | matrix.reserveCapacity(documents.count * config.dimension) // Pre-allocate for better performance
183 |
184 | for doc in documents.values {
185 | if let normalized = normalizedEmbeddings[doc.id] {
186 | docIds.append(doc.id)
187 | matrix.append(contentsOf: normalized)
188 | }
189 | }
190 |
191 | let docsCount = docIds.count
192 | if docsCount == 0 {
193 | return []
194 | }
195 |
196 | let M = Int32(docsCount) // number of rows (documents)
197 | let N = Int32(config.dimension) // number of columns (embedding dimension)
198 | var similarities = [Float](repeating: 0, count: docsCount)
199 |
200 | // Convert Int32 to Int for LAPACK compatibility
201 | let mInt = Int(M) // Convert number of rows
202 | let nInt = Int(N) // Convert number of columns
203 | let ldInt = Int(N) // Convert leading dimension
204 |
205 | // Compute all similarities at once using matrix-vector multiplication
206 | // Matrix is in row-major order, so we use CblasNoTrans
207 | cblas_sgemv(
208 | CblasRowMajor, // matrix layout
209 | CblasNoTrans, // no transpose needed for row-major
210 | mInt, // number of rows (documents) as Int
211 | nInt, // number of columns (dimension) as Int
212 | 1.0, // alpha scaling factor
213 | matrix, // matrix
214 | ldInt, // leading dimension as Int
215 | normalizedQuery, // vector
216 | 1, // vector increment
217 | 0.0, // beta scaling factor
218 | &similarities, // result vector
219 | 1 // result increment
220 | )
221 |
222 | // Construct the results
223 | var results = [VecturaSearchResult]()
224 | results.reserveCapacity(docsCount) // Pre-allocate for better performance
225 |
226 | for (i, similarity) in similarities.enumerated() {
227 | if let minT = threshold ?? config.searchOptions.minThreshold, similarity < minT {
228 | continue
229 | }
230 | if let doc = documents[docIds[i]] {
231 | results.append(
232 | VecturaSearchResult(
233 | id: doc.id,
234 | text: doc.text,
235 | score: similarity,
236 | createdAt: doc.createdAt
237 | )
238 | )
239 | }
240 | }
241 |
242 | results.sort { $0.score > $1.score }
243 |
244 | let limit = numResults ?? config.searchOptions.defaultNumResults
245 | return Array(results.prefix(limit))
246 | }
247 |
/// Runs a hybrid search that combines vector similarity with BM25 text scores.
///
/// - Parameters:
///   - query: The text to search for.
///   - numResults: Maximum number of results; falls back to
///     `config.searchOptions.defaultNumResults` when `nil`.
///   - threshold: Minimum hybrid score; falls back to
///     `config.searchOptions.minThreshold` when `nil`.
///   - model: Source of the embedding model, used only if no model is loaded yet.
/// - Returns: Results sorted by descending hybrid score.
/// - Throws: Errors from model loading, query encoding, or the vector search.
public func search(
  query: String,
  numResults: Int? = nil,
  threshold: Float? = nil,
  model: VecturaModelSource = .default
) async throws -> [VecturaSearchResult] {
  if bertModel == nil {
    bertModel = try await Bert.loadModelBundle(from: model)
  }

  guard let modelBundle = bertModel else {
    throw VecturaError.invalidInput("Failed to load BERT model: \(model)")
  }

  // Initialize BM25 index if needed
  if bm25Index == nil {
    bm25Index = BM25Index(
      documents: Array(documents.values),
      k1: config.searchOptions.k1,
      b: config.searchOptions.b
    )
  }

  // Get vector similarity results.
  // Ask for every document as a candidate: passing `nil` would truncate the
  // candidates to `defaultNumResults` before hybrid re-ranking, so a caller
  // requesting more results than the default could never receive them.
  // `threshold: nil` makes the vector search fall back to the configured
  // minimum threshold.
  let queryEmbeddingTensor = try modelBundle.encode(query)
  let queryEmbedding = await tensorToArray(queryEmbeddingTensor)
  let vectorResults = try await search(
    query: queryEmbedding,
    numResults: documents.count,
    threshold: nil
  )

  let bm25Results =
    bm25Index?.search(
      query: query,
      topK: documents.count
    ) ?? []

  // Create a map of document IDs to their BM25 scores
  let bm25Scores = Dictionary(
    bm25Results.map { ($0.document.id, $0.score) },
    uniquingKeysWith: { first, _ in first }
  )

  // Combine scores using hybrid scoring
  var hybridResults = vectorResults.map { result in
    let bm25Score = bm25Scores[result.id] ?? 0
    // `hybridScore` is an instance method, so a throwaway document is built to call it.
    let hybridScore = VecturaDocument(
      id: result.id,
      text: result.text,
      embedding: []
    ).hybridScore(
      vectorScore: result.score,
      bm25Score: bm25Score,
      weight: config.searchOptions.hybridWeight
    )

    return VecturaSearchResult(
      id: result.id,
      text: result.text,
      score: hybridScore,
      createdAt: result.createdAt
    )
  }

  hybridResults.sort { $0.score > $1.score }

  if let threshold = threshold ?? config.searchOptions.minThreshold {
    hybridResults = hybridResults.filter { $0.score >= threshold }
  }

  let limit = numResults ?? config.searchOptions.defaultNumResults
  return Array(hybridResults.prefix(limit))
}
323 |
/// Convenience overload of the hybrid search that takes the embedding model
/// as a plain identifier string.
@_disfavoredOverload
public func search(
  query: String,
  numResults: Int? = nil,
  threshold: Float? = nil,
  modelId: String = VecturaModelSource.defaultModelId
) async throws -> [VecturaSearchResult] {
  let source = VecturaModelSource.id(modelId)
  return try await search(
    query: query,
    numResults: numResults,
    threshold: threshold,
    model: source
  )
}
334 |
/// Removes every document from memory and deletes every item in the storage directory.
///
/// - Throws: If the storage directory cannot be listed or an item cannot be removed.
public func reset() async throws {
  documents.removeAll()
  normalizedEmbeddings.removeAll()

  // Drop the BM25 index too; otherwise it keeps the statistics of the deleted
  // documents. The text search rebuilds it lazily when it is `nil`.
  bm25Index = nil

  let files = try FileManager.default.contentsOfDirectory(
    at: storageDirectory, includingPropertiesForKeys: nil)
  for fileURL in files {
    try FileManager.default.removeItem(at: fileURL)
  }
}
345 |
/// Deletes the documents with the given IDs from disk and from memory.
///
/// IDs are processed in order. If removing a file fails, the error is thrown,
/// the failing document and all later ones are left untouched, and documents
/// already deleted stay deleted.
///
/// - Parameter ids: Identifiers of the documents to delete.
/// - Throws: If a document's JSON file cannot be removed (for example because it does not exist).
public func deleteDocuments(ids: [UUID]) async throws {
  // Rebuild the BM25 index on every exit path, including a thrown error, so it
  // always reflects the documents that are actually still present.
  defer {
    if bm25Index != nil {
      bm25Index = BM25Index(
        documents: Array(documents.values),
        k1: config.searchOptions.k1,
        b: config.searchOptions.b
      )
    }
  }

  for id in ids {
    // Remove the file first: if this throws, memory still matches what is on disk.
    let documentURL = storageDirectory.appendingPathComponent("\(id).json")
    try FileManager.default.removeItem(at: documentURL)

    documents[id] = nil
    normalizedEmbeddings[id] = nil
  }
}
364 |
/// Replaces the text of the document `id` by deleting it and adding
/// `newText` again under the same identifier.
///
/// - Parameters:
///   - id: Identifier of the document to replace.
///   - newText: The replacement text.
///   - model: Source of the embedding model passed on to `addDocument`.
/// - Throws: Errors from `deleteDocuments(ids:)` or `addDocument(text:id:model:)`.
public func updateDocument(
  id: UUID,
  newText: String,
  model: VecturaModelSource = .default
) async throws {
  let obsoleteIDs = [id]
  try await deleteDocuments(ids: obsoleteIDs)

  // The returned identifier equals `id`, so it is intentionally discarded.
  let _ = try await addDocument(text: newText, id: id, model: model)
}
374 |
/// Convenience overload of `updateDocument` that takes the embedding model
/// as a plain identifier string.
@_disfavoredOverload
public func updateDocument(
  id: UUID,
  newText: String,
  modelId: String = VecturaModelSource.defaultModelId
) async throws {
  let source = VecturaModelSource.id(modelId)
  try await updateDocument(id: id, newText: newText, model: source)
}
383 |
384 | // MARK: - Private
385 |
/// Casts `tensor` to `Float` and returns the scalars of its shaped array.
///
/// - Parameter tensor: The tensor to read.
/// - Returns: The tensor's values as a flat `[Float]`.
private func tensorToArray(_ tensor: MLTensor) async -> [Float] {
  return await tensor.cast(to: Float.self).shapedArray(of: Float.self).scalars
}
390 |
/// Computes the dot product of two vectors with vDSP.
///
/// Only the overlapping prefix of the two vectors is used, so vectors of different lengths
/// never cause a read past the end of the shorter one.
///
/// - Parameters:
///   - a: First vector.
///   - b: Second vector.
/// - Returns: The sum of the element-wise products over the shared length.
private func dotProduct(_ a: [Float], _ b: [Float]) -> Float {
  let sharedLength = min(a.count, b.count)
  var result: Float = 0
  vDSP_dotpr(a, 1, b, 1, &result, vDSP_Length(sharedLength))
  return result
}
396 |
/// Returns the Euclidean (L2) length of `v`.
///
/// - Parameter v: The vector to measure.
/// - Returns: The square root of the sum of the squared elements.
private func l2Norm(_ v: [Float]) -> Float {
  let elementCount = vDSP_Length(v.count)
  var squaredLength = Float.zero
  vDSP_svesq(v, 1, &squaredLength, elementCount)
  return squaredLength.squareRoot()
}
402 | }
403 |
@available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, watchOS 11.0, *)
extension Bert {
  /// Loads a model bundle from either kind of ``VecturaModelSource``.
  ///
  /// - Parameter source: `.id` carries a model identifier, `.folder` carries a directory URL;
  ///   each is handed to the matching `loadModelBundle(from:)` overload.
  /// - Returns: The loaded model bundle.
  /// - Throws: Any error thrown by the underlying `loadModelBundle(from:)` call.
  static func loadModelBundle(from source: VecturaModelSource) async throws -> Bert.ModelBundle {
    switch source {
    case .id(let modelId):
      return try await loadModelBundle(from: modelId)
    case .folder(let url):
      return try await loadModelBundle(from: url)
    }
  }
}
415 |
--------------------------------------------------------------------------------
/Sources/VecturaKit/VecturaModelSource.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 |
/// Describes where the resources of an embedding model come from.
public enum VecturaModelSource: Sendable, CustomStringConvertible {
  /// A model referenced by its identifier, to be fetched from a remote repository.
  case id(_ id: String)

  /// A model stored locally in the directory at the given URL.
  case folder(_ url: URL)
}
10 |
public extension VecturaModelSource {
  /// Identifier of the model used when the caller does not name one.
  static let defaultModelId: String = "sentence-transformers/all-MiniLM-L6-v2"

  /// Model source used when the caller does not supply one; wraps ``defaultModelId``.
  static let `default`: VecturaModelSource = .id(defaultModelId)
}
18 |
public extension VecturaModelSource {
  /// The model identifier for `.id`, or the non-percent-encoded path for `.folder`.
  var description: String {
    switch self {
    case .id(let identifier):
      return identifier
    case .folder(let directory):
      return directory.path(percentEncoded: false)
    }
  }
}
27 |
--------------------------------------------------------------------------------
/Sources/VecturaKit/VecturaProtocol.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 |
/// The operations a vector database instance must provide.
public protocol VecturaProtocol {

  /// Adds a batch of documents to the vector store.
  ///
  /// - Parameters:
  ///   - texts: Text content of each document.
  ///   - ids: Optional identifiers for the documents; pass `nil` to supply none.
  ///   - model: A ``VecturaModelSource`` describing how to load the embedding model
  ///     (e.g. `.id("sentence-transformers/all-MiniLM-L6-v2")`).
  /// - Returns: The identifiers of the added documents.
  func addDocuments(
    texts: [String],
    ids: [UUID]?,
    model: VecturaModelSource
  ) async throws -> [UUID]

  /// Finds similar documents for a query embedding that has already been computed.
  ///
  /// - Parameters:
  ///   - query: The query vector.
  ///   - numResults: Maximum number of results to return.
  ///   - threshold: Minimum similarity a result must reach.
  /// - Returns: Search results ordered by similarity.
  func search(
    query: [Float],
    numResults: Int?,
    threshold: Float?
  ) async throws -> [VecturaSearchResult]

  /// Removes every document from the vector store.
  func reset() async throws
}
36 |
37 | // MARK: - Default Implementations
38 |
public extension VecturaProtocol {

  /// Adds a single document to the vector store by embedding its text.
  ///
  /// Forwards to `addDocuments(texts:ids:model:)` with a one-element batch.
  ///
  /// - Parameters:
  ///   - text: The text content of the document.
  ///   - id: Optional unique identifier for the document.
  ///   - model: A ``VecturaModelSource`` specifying how to load the model
  ///     (e.g. `.id("sentence-transformers/all-MiniLM-L6-v2")`).
  /// - Returns: The ID of the added document.
  /// - Throws: Any error from `addDocuments(texts:ids:model:)`, or
  ///   `VecturaError.invalidInput` if that call returns no identifier.
  func addDocument(
    text: String,
    id: UUID? = nil,
    model: VecturaModelSource = .default
  ) async throws -> UUID {
    let ids = try await addDocuments(
      texts: [text],
      ids: id.map { [$0] },
      model: model
    )
    // Fail with an error rather than trapping if a conforming type returns an empty batch.
    guard let documentId = ids.first else {
      throw VecturaError.invalidInput("No identifier was returned for the added document")
    }
    return documentId
  }

  /// Adds a single document to the vector store, selecting the model by identifier string.
  ///
  /// - Parameters:
  ///   - text: The text content of the document.
  ///   - id: Optional unique identifier for the document.
  ///   - modelId: Identifier of the model to use for generating the embedding
  ///     (e.g. `"sentence-transformers/all-MiniLM-L6-v2"`).
  /// - Returns: The ID of the added document.
  @_disfavoredOverload
  func addDocument(
    text: String,
    id: UUID?,
    modelId: String = VecturaModelSource.defaultModelId
  ) async throws -> UUID {
    try await addDocument(text: text, id: id, model: .id(modelId))
  }

  /// Adds multiple documents to the vector store in batch, selecting the model by
  /// identifier string.
  ///
  /// - Parameters:
  ///   - texts: The text contents of the documents.
  ///   - ids: Optional unique identifiers for the documents.
  ///   - modelId: Identifier of the model to use for generating the embeddings
  ///     (e.g. `"sentence-transformers/all-MiniLM-L6-v2"`).
  /// - Returns: The IDs of the added documents.
  func addDocuments(
    texts: [String],
    ids: [UUID]? = nil,
    modelId: String = VecturaModelSource.defaultModelId
  ) async throws -> [UUID] {
    try await addDocuments(texts: texts, ids: ids, model: .id(modelId))
  }
}
95 |
--------------------------------------------------------------------------------
/Sources/VecturaKit/VecturaSearchResult.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 |
/// A single match returned by a vector database search.
public struct VecturaSearchResult: Identifiable, Sendable {

  /// Identifier of the matching document.
  public let id: UUID

  /// Text content of the matching document.
  public let text: String

  /// Similarity score between the query and the document.
  public let score: Float

  /// Time at which the document was created.
  public let createdAt: Date

  /// Creates a search result from its four stored values.
  ///
  /// - Parameters:
  ///   - id: Identifier of the matching document.
  ///   - text: Text content of the matching document.
  ///   - score: Similarity score between the query and the document.
  ///   - createdAt: Time at which the document was created.
  public init(id: UUID, text: String, score: Float, createdAt: Date) {
    (self.id, self.text) = (id, text)
    (self.score, self.createdAt) = (score, createdAt)
  }
}
32 |
--------------------------------------------------------------------------------
/Sources/VecturaKit/VecturaStorage.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 |
/// Abstraction over the persistence layer used for `VecturaDocument` values.
///
/// Different backends (for example file-based or SQLite) can conform to it, leaving the
/// higher-level VecturaKit API unchanged.
public protocol VecturaStorage {
  /// Prepares the storage location for documents, creating it when necessary.
  func createStorageDirectoryIfNeeded() async throws

  /// Loads the documents that have been persisted.
  ///
  /// - Returns: The stored documents.
  func loadDocuments() async throws -> [VecturaDocument]

  /// Persists a document.
  ///
  /// - Parameter document: The document to save.
  func saveDocument(_ document: VecturaDocument) async throws

  /// Deletes the document with the given identifier.
  ///
  /// - Parameter id: Identifier of the document to delete.
  func deleteDocument(withID id: UUID) async throws

  /// Updates an existing document, replacing or modifying it as needed.
  ///
  /// - Parameter document: The updated document.
  func updateDocument(_ document: VecturaDocument) async throws
}
31 |
--------------------------------------------------------------------------------
/Sources/VecturaMLXCLI/VecturaMLXCLI.swift:
--------------------------------------------------------------------------------
1 | import ArgumentParser
2 | import Foundation
3 | import MLXEmbedders
4 | import VecturaKit
5 | import VecturaMLXKit
6 |
/// Entry point of the `vectura-mlx` command line tool.
@available(macOS 14.0, iOS 17.0, tvOS 17.0, visionOS 1.0, watchOS 10.0, *)
@main
struct VecturaMLXCLI: AsyncParsableCommand {
  /// A document identifier that can be parsed from a command line argument.
  struct DocumentID: ExpressibleByArgument, Decodable {
    let uuid: UUID

    init(_ uuid: UUID) {
      self.uuid = uuid
    }

    /// Fails when `argument` is not a valid UUID string.
    init?(argument: String) {
      guard let parsed = UUID(uuidString: argument) else {
        return nil
      }
      self.init(parsed)
    }
  }

  static let configuration = CommandConfiguration(
    commandName: "vectura-mlx",
    abstract: "A CLI tool for VecturaMLXKit vector database using MLX",
    subcommands: [Add.self, Search.self, Update.self, Delete.self, Reset.self, Mock.self]
  )

  /// Opens the database named `dbName` using the given embedding model configuration.
  ///
  /// - Parameters:
  ///   - dbName: Name of the database.
  ///   - modelConfiguration: MLX embedding model configuration to load.
  /// - Returns: A ready-to-use `VecturaMLXKit` instance.
  static func setupDB(
    dbName: String,
    modelConfiguration: MLXEmbedders.ModelConfiguration = .nomic_text_v1_5
  ) async throws -> VecturaMLXKit {
    // nomic_text_v1_5 model outputs 768-dimensional embeddings
    let databaseConfig = VecturaConfig(name: dbName, dimension: 768)
    return try await VecturaMLXKit(
      config: databaseConfig,
      modelConfiguration: modelConfiguration
    )
  }
}
42 |
@available(macOS 14.0, iOS 17.0, tvOS 17.0, visionOS 1.0, watchOS 10.0, *)
extension VecturaMLXCLI {
  /// Prints the summary shown after documents have been added.
  private static func printAddedDocuments(ids: [UUID], texts: [String]) {
    print("Added \(ids.count) documents:")
    for (documentId, documentText) in zip(ids, texts) {
      print("ID: \(documentId)")
      print("Text: \(documentText)")
      print("---")
    }
  }

  /// Prints a list of search results.
  private static func printSearchResults(_ results: [VecturaSearchResult]) {
    print("Found \(results.count) results:")
    for match in results {
      print("ID: \(match.id)")
      print("Text: \(match.text)")
      print("Score: \(match.score)")
      print("Created: \(match.createdAt)")
      print("---")
    }
  }

  /// Runs a scripted demonstration: reset, add, search, update, delete.
  struct Mock: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
      abstract: "Run a mock demonstration with sample data"
    )

    @Option(name: [.long, .customShort("d")], help: "Database name")
    var dbName: String = "vectura-mlx-cli-db"

    mutating func run() async throws {
      print("Starting mock command...")

      print("Setting up database...")
      let database = try await VecturaMLXCLI.setupDB(dbName: dbName)
      print("Database setup complete")

      // Start from an empty database.
      print("\n🧹 Resetting database...")
      try await database.reset()
      print("Reset complete")

      // Insert the sample documents.
      print("\n📝 Adding sample documents...")
      let sampleTexts = [
        "The quick brown fox jumps over the lazy dog",
        "To be or not to be, that is the question",
        "All that glitters is not gold",
        "A journey of a thousand miles begins with a single step",
        "Where there's smoke, there's fire",
      ]

      let ids = try await database.addDocuments(texts: sampleTexts)
      VecturaMLXCLI.printAddedDocuments(ids: ids, texts: sampleTexts)

      // Query the documents that were just added.
      print("\n🔍 Searching for 'journey'...")
      let results = try await database.search(query: "journey")
      VecturaMLXCLI.printSearchResults(results)

      // Rewrite the first document.
      if let firstId = ids.first {
        print("\n✏️ Updating first document...")
        let newText = "The quick red fox jumps over the sleeping dog"
        try await database.updateDocument(id: firstId, newText: newText)
        print("Updated document \(firstId) with new text: \(newText)")
      }

      // Remove the last document.
      if let lastId = ids.last {
        print("\n🗑️ Deleting last document...")
        try await database.deleteDocuments(ids: [lastId])
        print("Deleted document \(lastId)")
      }

      print("\n✨ Mock demonstration completed!")
    }
  }

  /// Adds one or more texts to the database and prints their identifiers.
  struct Add: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
      abstract: "Add documents to the vector database"
    )

    @Option(name: [.long, .customShort("d")], help: "Database name")
    var dbName: String = "vectura-mlx-cli-db"

    @Argument(help: "Text content to add")
    var text: [String]

    mutating func run() async throws {
      let database = try await VecturaMLXCLI.setupDB(dbName: dbName)
      let ids = try await database.addDocuments(texts: text)
      VecturaMLXCLI.printAddedDocuments(ids: ids, texts: text)
    }
  }

  /// Searches the database with a text query and prints the matches.
  struct Search: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
      abstract: "Search documents in the vector database"
    )

    @Option(name: [.long, .customShort("d")], help: "Database name")
    var dbName: String = "vectura-mlx-cli-db"

    @Option(name: [.long, .customShort("t")], help: "Minimum similarity threshold")
    var threshold: Float?

    @Option(name: [.long, .customShort("n")], help: "Number of results to return")
    var numResults: Int?

    @Argument(help: "Search query")
    var query: String

    mutating func run() async throws {
      if query.isEmpty {
        print("Error: Query cannot be empty.")
        throw ExitCode.failure
      }

      let database = try await VecturaMLXCLI.setupDB(dbName: dbName)
      let results = try await database.search(
        query: query,
        numResults: numResults,
        threshold: threshold
      )
      VecturaMLXCLI.printSearchResults(results)
    }
  }

  /// Replaces the text of a single document.
  struct Update: AsyncParsableCommand, Decodable {
    static let configuration = CommandConfiguration(
      abstract: "Update a document in the vector database"
    )

    @Option(name: [.long, .customShort("d")], help: "Database name")
    var dbName: String = "vectura-mlx-cli-db"

    @Argument(help: "Document ID to update")
    var id: DocumentID

    @Argument(help: "New text content")
    var newText: String

    mutating func run() async throws {
      let database = try await VecturaMLXCLI.setupDB(dbName: dbName)
      let documentId = id.uuid
      try await database.updateDocument(id: documentId, newText: newText)
      print("Updated document \(documentId) with new text: \(newText)")
    }
  }

  /// Deletes the documents with the given identifiers.
  struct Delete: AsyncParsableCommand, Decodable {
    static let configuration = CommandConfiguration(
      abstract: "Delete documents from the vector database"
    )

    @Option(name: [.long, .customShort("d")], help: "Database name")
    var dbName: String = "vectura-mlx-cli-db"

    @Argument(help: "Document IDs to delete")
    var ids: [DocumentID]

    mutating func run() async throws {
      let database = try await VecturaMLXCLI.setupDB(dbName: dbName)
      let uuids = ids.map { $0.uuid }
      try await database.deleteDocuments(ids: uuids)
      print("Deleted \(ids.count) documents")
    }
  }

  /// Removes every document from the database.
  struct Reset: AsyncParsableCommand {
    static let configuration = CommandConfiguration(
      abstract: "Reset the vector database"
    )

    @Option(name: [.long, .customShort("d")], help: "Database name")
    var dbName: String = "vectura-mlx-cli-db"

    mutating func run() async throws {
      let database = try await VecturaMLXCLI.setupDB(dbName: dbName)
      try await database.reset()
      print("Database reset successfully")
    }
  }
}
233 |
--------------------------------------------------------------------------------
/Sources/VecturaMLXKit/MLXEmbedder.swift:
--------------------------------------------------------------------------------
1 | import Foundation
2 | import MLX
3 | import MLXEmbedders
4 | import VecturaKit
5 |
/// Produces text embeddings with an MLX embedding model.
@available(macOS 14.0, iOS 17.0, tvOS 17.0, visionOS 1.0, watchOS 10.0, *)
public class MLXEmbedder {
  private let modelContainer: ModelContainer
  private let configuration: ModelConfiguration

  /// Loads the model container for `configuration`.
  ///
  /// - Parameter configuration: The embedding model to load.
  /// - Throws: Any error thrown by `MLXEmbedders.loadModelContainer(configuration:)`.
  public init(configuration: ModelConfiguration = .nomic_text_v1_5) async throws {
    self.configuration = configuration
    self.modelContainer = try await MLXEmbedders.loadModelContainer(configuration: configuration)
  }

  /// Embeds a batch of texts.
  ///
  /// - Parameter texts: The texts to embed.
  /// - Returns: One embedding per input text, in input order; an empty array for an empty batch.
  public func embed(texts: [String]) async -> [[Float]] {
    // There is nothing to stack for an empty batch, so skip the model entirely.
    guard !texts.isEmpty else { return [] }

    return await modelContainer.perform { (model: EmbeddingModel, tokenizer, pooling) -> [[Float]] in
      let inputs = texts.map {
        tokenizer.encode(text: $0, addSpecialTokens: true)
      }

      // The EOS token id (0 when the tokenizer has none) is used as the padding value.
      let padToken = tokenizer.eosTokenId ?? 0

      // Pad to the longest input, with a minimum length of 16.
      let maxLength = inputs.reduce(into: 16) { acc, elem in
        acc = max(acc, elem.count)
      }

      let padded = stacked(
        inputs.map { elem in
          MLXArray(elem + Array(repeating: padToken, count: maxLength - elem.count))
        })

      // Positions holding the padding value are excluded from attention.
      let mask = (padded .!= padToken)
      let tokenTypes = MLXArray.zeros(like: padded)

      let result = pooling(
        model(padded, positionIds: nil, tokenTypeIds: tokenTypes, attentionMask: mask),
        normalize: true, applyLayerNorm: true
      )

      return result.map { $0.asArray(Float.self) }
    }
  }

  /// Embeds a single text.
  ///
  /// - Parameter text: The text to embed.
  /// - Returns: The embedding of `text`.
  public func embed(text: String) async throws -> [Float] {
    let embeddings = await embed(texts: [text])
    return embeddings[0]
  }
}
53 |
--------------------------------------------------------------------------------
/Sources/VecturaMLXKit/VecturaMLXKit.swift:
--------------------------------------------------------------------------------
1 | import Accelerate
2 | import Foundation
3 | import MLXEmbedders
4 | import VecturaKit
5 |
/// A file-backed vector database that embeds text with an MLX embedding model.
@available(macOS 14.0, iOS 17.0, tvOS 17.0, visionOS 1.0, watchOS 10.0, *)
public class VecturaMLXKit {
  private let config: VecturaConfig
  private let embedder: MLXEmbedder
  private var documents: [UUID: VecturaDocument] = [:]
  private var normalizedEmbeddings: [UUID: [Float]] = [:]
  private let storageDirectory: URL

  /// Creates the database, loading the embedding model and any previously stored documents.
  ///
  /// Documents live in `<config.directoryURL>/<config.name>` when a directory is configured,
  /// otherwise in `VecturaKit/<config.name>` inside the user's documents directory.
  ///
  /// - Parameters:
  ///   - config: Database configuration.
  ///   - modelConfiguration: MLX embedding model to load.
  /// - Throws: Errors from loading the model, creating the storage directory, or loading
  ///   stored documents; `VecturaError.loadFailed` if no documents directory can be found.
  public init(config: VecturaConfig, modelConfiguration: ModelConfiguration = .nomic_text_v1_5)
    async throws
  {
    self.config = config
    self.embedder = try await MLXEmbedder(configuration: modelConfiguration)

    if let customStorageDirectory = config.directoryURL {
      self.storageDirectory = customStorageDirectory.appending(path: config.name)
    } else {
      // Fall back to the default location inside the user's documents directory.
      guard
        let documentsDirectory = FileManager.default.urls(
          for: .documentDirectory, in: .userDomainMask
        ).first
      else {
        throw VecturaError.loadFailed("Unable to locate the user's documents directory")
      }
      self.storageDirectory =
        documentsDirectory
        .appendingPathComponent("VecturaKit")
        .appendingPathComponent(config.name)
    }

    // Succeeds when the directory already exists, so no separate existence check is needed.
    try FileManager.default.createDirectory(at: storageDirectory, withIntermediateDirectories: true)

    // Attempt to load existing docs
    try loadDocuments()
  }

  /// Embeds and stores a batch of documents.
  ///
  /// - Parameters:
  ///   - texts: Text content of each document.
  ///   - ids: Optional identifiers, one per text; fresh UUIDs are generated when `nil`.
  /// - Returns: The identifiers of the stored documents, in input order.
  /// - Throws: `VecturaError.invalidInput` when `ids` and `texts` differ in length, or any
  ///   error raised while encoding or writing a document file.
  public func addDocuments(texts: [String], ids: [UUID]? = nil) async throws -> [UUID] {
    if let ids = ids, ids.count != texts.count {
      throw VecturaError.invalidInput("Number of IDs must match number of texts")
    }

    // An empty batch has nothing to embed or persist.
    guard !texts.isEmpty else { return [] }

    let embeddings = await embedder.embed(texts: texts)
    var documentIds = [UUID]()
    var documentsToSave = [VecturaDocument]()
    documentIds.reserveCapacity(texts.count)
    documentsToSave.reserveCapacity(texts.count)

    for (index, text) in texts.enumerated() {
      let docId = ids?[index] ?? UUID()
      let doc = VecturaDocument(id: docId, text: text, embedding: embeddings[index])

      // Normalize embedding for cosine similarity
      normalizedEmbeddings[doc.id] = normalized(doc.embedding)
      documents[doc.id] = doc
      documentIds.append(docId)
      documentsToSave.append(doc)
    }

    // Write each document to its own JSON file concurrently.
    try await withThrowingTaskGroup(of: Void.self) { group in
      let directory = self.storageDirectory

      for doc in documentsToSave {
        group.addTask {
          let documentURL = directory.appendingPathComponent("\(doc.id).json")
          let encoder = JSONEncoder()
          encoder.outputFormatting = .prettyPrinted

          let data = try encoder.encode(doc)
          try data.write(to: documentURL)
        }
      }

      try await group.waitForAll()
    }

    return documentIds
  }

  /// Searches the stored documents by cosine similarity to `query`.
  ///
  /// - Parameters:
  ///   - query: The text to search for; must not be empty.
  ///   - numResults: Maximum number of results; defaults to the configured value.
  ///   - threshold: Minimum similarity; defaults to the configured minimum, if any.
  /// - Returns: Matching documents ordered by descending similarity.
  /// - Throws: `VecturaError.invalidInput` for an empty query, or any embedding error.
  public func search(query: String, numResults: Int? = nil, threshold: Float? = nil) async throws
    -> [VecturaSearchResult]
  {
    guard !query.isEmpty else {
      throw VecturaError.invalidInput("Query cannot be empty")
    }

    let queryEmbedding = try await embedder.embed(text: query)
    let normalizedQuery = normalized(queryEmbedding)
    let minimumScore = threshold ?? config.searchOptions.minThreshold

    var results: [VecturaSearchResult] = []

    for doc in documents.values {
      guard let normDoc = normalizedEmbeddings[doc.id] else { continue }
      let similarity = dotProduct(normalizedQuery, normDoc)

      if let minimumScore, similarity < minimumScore {
        continue
      }

      results.append(
        VecturaSearchResult(
          id: doc.id,
          text: doc.text,
          score: similarity,
          createdAt: doc.createdAt
        )
      )
    }

    results.sort { $0.score > $1.score }

    let limit = numResults ?? config.searchOptions.defaultNumResults
    return Array(results.prefix(limit))
  }

  /// Deletes the documents with the given identifiers from memory and from disk.
  ///
  /// - Parameter ids: Identifiers of the documents to delete.
  /// - Throws: The error from `FileManager.removeItem(at:)` when a document file cannot be
  ///   removed.
  public func deleteDocuments(ids: [UUID]) async throws {
    for id in ids {
      documents[id] = nil
      normalizedEmbeddings[id] = nil

      let documentURL = storageDirectory.appendingPathComponent("\(id).json")
      try FileManager.default.removeItem(at: documentURL)
    }
  }

  /// Replaces the text of an existing document by deleting it and adding it again.
  ///
  /// - Parameters:
  ///   - id: Identifier of the document to replace.
  ///   - newText: The replacement text.
  public func updateDocument(id: UUID, newText: String) async throws {
    try await deleteDocuments(ids: [id])
    _ = try await addDocuments(texts: [newText], ids: [id])
  }

  /// Removes every document from memory and deletes everything in the storage directory.
  public func reset() async throws {
    documents.removeAll()
    normalizedEmbeddings.removeAll()

    let files = try FileManager.default.contentsOfDirectory(
      at: storageDirectory, includingPropertiesForKeys: nil)
    for fileURL in files {
      try FileManager.default.removeItem(at: fileURL)
    }
  }

  // MARK: - Private

  /// Loads every `.json` file in the storage directory into memory.
  ///
  /// Files that fail to load are collected and reported together after all files were tried.
  private func loadDocuments() throws {
    let fileURLs = try FileManager.default.contentsOfDirectory(
      at: storageDirectory, includingPropertiesForKeys: nil)

    let decoder = JSONDecoder()
    var loadErrors: [String] = []

    for fileURL in fileURLs where fileURL.pathExtension == "json" {
      do {
        let data = try Data(contentsOf: fileURL)
        let doc = try decoder.decode(VecturaDocument.self, from: data)

        // Rebuild normalized embeddings
        normalizedEmbeddings[doc.id] = normalized(doc.embedding)
        documents[doc.id] = doc
      } catch {
        loadErrors.append(
          "Failed to load \(fileURL.lastPathComponent): \(error.localizedDescription)")
      }
    }

    if !loadErrors.isEmpty {
      throw VecturaError.loadFailed(loadErrors.joined(separator: "\n"))
    }
  }

  /// Scales `vector` by its L2 norm; a small epsilon keeps the divisor non-zero.
  private func normalized(_ vector: [Float]) -> [Float] {
    var divisor = l2Norm(vector) + 1e-9
    var output = [Float](repeating: 0, count: vector.count)
    vDSP_vsdiv(vector, 1, &divisor, &output, 1, vDSP_Length(vector.count))
    return output
  }

  /// Dot product over the shared length of `a` and `b`, so a length mismatch cannot read
  /// past the end of the shorter vector.
  private func dotProduct(_ a: [Float], _ b: [Float]) -> Float {
    var result: Float = 0
    vDSP_dotpr(a, 1, b, 1, &result, vDSP_Length(min(a.count, b.count)))
    return result
  }

  /// Euclidean (L2) length of `v`.
  private func l2Norm(_ v: [Float]) -> Float {
    var sumSquares: Float = 0
    vDSP_svesq(v, 1, &sumSquares, vDSP_Length(v.count))
    return sqrt(sumSquares)
  }
}
199 |
--------------------------------------------------------------------------------
/Tests/VecturaKitTests/VecturaKitTests.swift:
--------------------------------------------------------------------------------
1 | import XCTest
2 |
3 | @testable import VecturaKit
4 | import Embeddings
5 |
6 | @available(macOS 15.0, iOS 18.0, tvOS 18.0, visionOS 2.0, watchOS 11.0, *)
7 | final class VecturaKitTests: XCTestCase {
8 | var vectura: VecturaKit!
9 | var config: VecturaConfig!
10 |
/// Creates the 384-dimension "test-db" database used by each test.
override func setUp() async throws {
  let testConfig = VecturaConfig(name: "test-db", dimension: 384)
  config = testConfig
  vectura = try VecturaKit(config: testConfig)
}
15 |
/// Empties the test database and releases the instance.
override func tearDown() async throws {
  let database: VecturaKit = vectura
  try await database.reset()
  vectura = nil
}
20 |
/// A single added document is returned by a search for related text.
func testAddAndSearchDocument() async throws {
  let text = "This is a test document"
  let id = try await vectura.addDocument(text: text)

  let results = try await vectura.search(query: "test document")
  XCTAssertEqual(results.count, 1)

  // Unwrap instead of indexing so an empty result fails the test rather than trapping.
  let match = try XCTUnwrap(results.first)
  XCTAssertEqual(match.id, id)
  XCTAssertEqual(match.text, text)
}
30 |
/// A batch of documents is stored and search results come back ordered by score.
func testAddMultipleDocuments() async throws {
  let documents = [
    "The quick brown fox jumps over the lazy dog",
    "Pack my box with five dozen liquor jugs",
    "How vexingly quick daft zebras jump",
  ]

  let ids = try await vectura.addDocuments(texts: documents)
  XCTAssertEqual(ids.count, 3)

  let results = try await vectura.search(query: "quick jumping animals")
  XCTAssertGreaterThanOrEqual(results.count, 2)

  // The count assertion does not stop the test, so guard before indexing.
  guard results.count >= 2 else { return }
  XCTAssertGreaterThan(results[0].score, results[1].score)
}
45 |
/// Documents added by one instance are visible to a new instance with the same config.
func testPersistence() async throws {
  // Add documents
  let texts = ["Document 1", "Document 2"]
  let ids = try await vectura.addDocuments(texts: texts)

  // Create new instance with same config
  let config = VecturaConfig(name: "test-db", dimension: 384)
  let newVectura = try VecturaKit(config: config)

  // Search should work with new instance
  let results = try await newVectura.search(query: "Document")
  XCTAssertEqual(results.count, 2)

  // Iterate instead of indexing so a short result list fails the test rather than trapping.
  for result in results {
    XCTAssertTrue(ids.contains(result.id))
  }
}
61 |
/// A high similarity threshold filters out at least one of three documents.
func testSearchThreshold() async throws {
  let corpus = [
    "Very relevant document about cats",
    "Somewhat relevant about pets",
    "Completely irrelevant about weather",
  ]
  let _ = try await vectura.addDocuments(texts: corpus)

  // A strict threshold must drop at least one document.
  let matches = try await vectura.search(query: "cats and pets", threshold: 0.8)
  XCTAssertLessThan(matches.count, 3)
}
74 |
/// A caller-supplied identifier is kept for the stored document.
func testCustomIds() async throws {
  let customId = UUID()
  let text = "Document with custom ID"

  let resultId = try await vectura.addDocument(text: text, id: customId)
  XCTAssertEqual(customId, resultId)

  let results = try await vectura.search(query: text)
  // Unwrap instead of indexing so an empty result fails the test rather than trapping.
  let match = try XCTUnwrap(results.first)
  XCTAssertEqual(match.id, customId)
}
85 |
/// Adding several documents in a row finishes within five seconds.
func testModelReuse() async throws {
  // Repeated operations are expected to share one loaded model.
  let startedAt = Date()
  for index in 1...5 {
    let _ = try await vectura.addDocument(text: "Test document \(index)")
  }
  let elapsed = Date().timeIntervalSince(startedAt)

  // With a reused model this should be relatively quick; adjust the limit as needed.
  XCTAssertLessThan(elapsed, 5.0)
}
97 |
/// Searching an empty database yields no results.
func testEmptySearch() async throws {
  let matches = try await vectura.search(query: "test query")
  XCTAssertEqual(
    matches.count,
    0,
    "Search on empty database should return no results"
  )
}
102 |
/// Adding a document to a database configured with the wrong dimension is rejected.
func testDimensionMismatch() async throws {
  // Configure 128 dimensions although the BERT model outputs 384.
  let mismatchedConfig = VecturaConfig(name: "wrong-dim-db", dimension: 128)
  let mismatchedVectura = try VecturaKit(config: mismatchedConfig)

  let text = "Test document"

  do {
    _ = try await mismatchedVectura.addDocument(text: text)
    XCTFail("Expected dimension mismatch error")
  } catch VecturaError.dimensionMismatch(let expected, let got) {
    XCTAssertEqual(expected, 128)
    XCTAssertEqual(got, 384)
  } catch let error as VecturaError {
    XCTFail("Wrong error type: \(error)")
  }
}
124 |
/// Adding a second document under an existing identifier overwrites the first.
func testDuplicateIds() async throws {
  let id = UUID()
  let text1 = "First document"
  let text2 = "Second document"

  // Add first document
  _ = try await vectura.addDocument(text: text1, id: id)

  // Adding second document with same ID should overwrite
  _ = try await vectura.addDocument(text: text2, id: id)

  let results = try await vectura.search(query: text2)
  XCTAssertEqual(results.count, 1)

  // Unwrap instead of indexing so an empty result fails the test rather than trapping.
  let match = try XCTUnwrap(results.first)
  XCTAssertEqual(match.text, text2)
}
140 |
/// Checks the threshold extremes: 1.0 returns nothing, 0.0 returns the stored document.
func testSearchThresholdEdgeCases() async throws {
  let _ = try await vectura.addDocuments(texts: ["Test document"])

  // Threshold 1.0 (exact match only): no perfect matches are expected due to encoding differences.
  let exactMatches = try await vectura.search(query: "Test document", threshold: 1.0)
  XCTAssertEqual(exactMatches.count, 0)

  // Threshold 0.0 (all matches): the single stored document should be returned.
  let everything = try await vectura.search(query: "completely different", threshold: 0.0)
  XCTAssertEqual(everything.count, 1)
}
153 |
154 | func testLargeNumberOfDocuments() async throws {
155 | let documentCount = 100
156 | var documents: [String] = []
157 |
158 | for i in 0.. results[1].score)
210 | }
211 |
/// A database configured with a custom directory writes its documents there.
func testCustomStorageDirectory() async throws {
  let customDirectoryURL = URL(filePath: NSTemporaryDirectory()).appending(path: "VecturaKitTest")
  defer { try? FileManager.default.removeItem(at: customDirectoryURL) }

  let customConfig = VecturaConfig(name: "test", directoryURL: customDirectoryURL, dimension: 384)
  let instance = try VecturaKit(config: customConfig)
  let id = UUID()
  let _ = try await instance.addDocument(text: "Test document", id: id)

  let documentPath =
    customDirectoryURL
    .appending(path: "test/\(id).json")
    .path(percentEncoded: false)
  let documentExists = FileManager.default.fileExists(atPath: documentPath)
  XCTAssertTrue(
    documentExists,
    "Custom storage directory inserted document doesn't exist at \(documentPath)"
  )
}
224 | }
225 |
--------------------------------------------------------------------------------
/Tests/VecturaMLXKitTests/VecturaMLXKitTests.swift:
--------------------------------------------------------------------------------
1 | import XCTest
2 | import Foundation
3 | @testable import VecturaMLXKit
4 | @testable import VecturaKit
5 |
6 | @available(macOS 14.0, iOS 17.0, tvOS 17.0, watchOS 10.0, *)
7 | final class VecturaMLXKitTests: XCTestCase {
8 |
9 | var testDirectory: URL!
10 | // Set a dimension matching your model expectation (e.g., 768)
11 | let testDimension = 768
12 |
/// Creates an empty temporary directory for the test, replacing any leftover one.
override func setUpWithError() throws {
  let fileManager = FileManager.default
  let directory = fileManager.temporaryDirectory
    .appendingPathComponent("VecturaMLXKitTests", isDirectory: true)
  testDirectory = directory

  let leftoverExists = fileManager.fileExists(atPath: directory.path)
  if leftoverExists {
    try fileManager.removeItem(at: directory)
  }
  try fileManager.createDirectory(at: directory, withIntermediateDirectories: true)
}
22 |
/// Deletes the temporary test folder referenced by `testDirectory`, if it still exists.
override func tearDownWithError() throws {
    let fileManager = FileManager.default
    guard fileManager.fileExists(atPath: testDirectory.path) else {
        return // Nothing to clean up.
    }
    try fileManager.removeItem(at: testDirectory)
}
29 |
/// Adds a single document and checks that searching for the same text returns it.
func testAddAndSearch() async throws {
    // A minThreshold of 0 is used so that any document is returned.
    let searchOptions = VecturaConfig.SearchOptions(
        defaultNumResults: 10,
        minThreshold: 0,
        hybridWeight: 0.5,
        k1: 1.2,
        b: 0.75
    )
    let config = VecturaConfig(
        name: "TestDB",
        directoryURL: testDirectory,
        dimension: testDimension,
        searchOptions: searchOptions
    )
    let kit = try await VecturaMLXKit(config: config, modelConfiguration: .nomic_text_v1_5)

    let sample = "Hello world"
    let addedIDs = try await kit.addDocuments(texts: [sample])
    XCTAssertEqual(addedIDs.count, 1, "Should add exactly one document.")

    // Query with exactly the text that was just stored.
    let hits = try await kit.search(query: sample)
    XCTAssertEqual(hits.count, 1, "The search should return one result after adding one document.")
    XCTAssertEqual(hits.first?.text, sample, "The text of the returned document should match the added text.")
}
49 |
/// Adds one document, deletes it by ID, and checks that a search for its text
/// comes back empty.
func testDeleteDocuments() async throws {
    let searchOptions = VecturaConfig.SearchOptions(
        defaultNumResults: 10,
        minThreshold: 0,
        hybridWeight: 0.5,
        k1: 1.2,
        b: 0.75
    )
    let config = VecturaConfig(
        name: "TestDB",
        directoryURL: testDirectory,
        dimension: testDimension,
        searchOptions: searchOptions
    )
    let kit = try await VecturaMLXKit(config: config, modelConfiguration: .nomic_text_v1_5)

    let doomedText = "Delete me"
    let addedIDs = try await kit.addDocuments(texts: [doomedText])
    XCTAssertEqual(addedIDs.count, 1, "Should add exactly one document.")

    // Remove exactly the IDs that addDocuments returned.
    try await kit.deleteDocuments(ids: addedIDs)

    let remaining = try await kit.search(query: doomedText)
    XCTAssertTrue(remaining.isEmpty, "After deletion, the document should not be returned in search results.")
}
68 |
/// Adds one document, replaces its text via `updateDocument`, and checks that a
/// search for the new text returns exactly one result carrying the updated text.
func testUpdateDocument() async throws {
    let config = VecturaConfig(
        name: "TestDB",
        directoryURL: testDirectory,
        dimension: testDimension,
        searchOptions: VecturaConfig.SearchOptions(defaultNumResults: 10, minThreshold: 0, hybridWeight: 0.5, k1: 1.2, b: 0.75)
    )
    let kit = try await VecturaMLXKit(config: config, modelConfiguration: .nomic_text_v1_5)

    let originalText = "Original text"
    let updatedText = "Updated text"
    let ids = try await kit.addDocuments(texts: [originalText])
    XCTAssertEqual(ids.count, 1, "Should add exactly one document.")

    // XCTAssertEqual above does not stop the test on failure, so unwrap safely:
    // XCTUnwrap records a test failure and throws if no ID came back, whereas a
    // force unwrap would trap and abort the whole test process.
    let documentID = try XCTUnwrap(ids.first, "addDocuments should return the ID of the added document.")
    try await kit.updateDocument(id: documentID, newText: updatedText)

    let results = try await kit.search(query: updatedText)
    XCTAssertEqual(results.count, 1, "One document should be returned after update.")
    XCTAssertEqual(results.first?.text, updatedText, "The document text should be updated in the search results.")
}
90 |
/// Adds two documents, calls `reset()`, and checks that a subsequent search
/// returns no results.
func testReset() async throws {
    let searchOptions = VecturaConfig.SearchOptions(
        defaultNumResults: 10,
        minThreshold: 0,
        hybridWeight: 0.5,
        k1: 1.2,
        b: 0.75
    )
    let config = VecturaConfig(
        name: "TestDB",
        directoryURL: testDirectory,
        dimension: testDimension,
        searchOptions: searchOptions
    )
    let kit = try await VecturaMLXKit(config: config, modelConfiguration: .nomic_text_v1_5)

    // Seed the store; the returned IDs are not needed for this test.
    let seedTexts = ["Doc1", "Doc2"]
    _ = try await kit.addDocuments(texts: seedTexts)

    try await kit.reset()

    let hitsAfterReset = try await kit.search(query: "Doc")
    XCTAssertTrue(hitsAfterReset.isEmpty, "After a reset, search should return no results.")
}
106 |
107 | // MARK: - Robust Search Tests
108 |
109 | func testSearchMultipleDocuments() async throws {
110 | let config = VecturaConfig(
111 | name: "TestMLXDB",
112 | directoryURL: testDirectory,
113 | dimension: testDimension,
114 | searchOptions: VecturaConfig.SearchOptions(defaultNumResults: 10, minThreshold: 0, hybridWeight: 0.5, k1: 1.2, b: 0.75)
115 | )
116 | let kit = try await VecturaMLXKit(config: config, modelConfiguration: .nomic_text_v1_5)
117 |
118 | // Add several documents with overlapping keywords.
119 | let texts = [
120 | "The quick brown fox jumps over the lazy dog",
121 | "A fast brown fox leaps over lazy hounds",
122 | "An agile brown fox",
123 | "Lazy dogs sleep all day",
124 | "Quick and nimble foxes"
125 | ]
126 | _ = try await kit.addDocuments(texts: texts)
127 |
128 | // Search for an expression close to "brown fox".
129 | let results = try await kit.search(query: "brown fox")
130 |
131 | // We expect at least two results related to 'brown fox'.
132 | XCTAssertGreaterThanOrEqual(results.count, 2, "Should return at least two documents related to 'brown fox'.")
133 |
134 | // Verify that results are sorted in descending order by score.
135 | for i in 1..