├── .gitignore ├── octanedb │   ├── _version.py │   ├── __init__.py │   ├── cli.py │   ├── embeddings.py │   ├── query.py │   ├── utils.py │   ├── storage.py │   ├── index.py │   ├── core.py │   └── collection.py ├── .pypirc.template ├── MANIFEST.in ├── LICENSE ├── CHANGELOG.md ├── pyproject.toml ├── install.py ├── PYPI_PUBLISHING_GUIDE.md ├── publish_to_pypi.py ├── README.md └── benchmark_octanedb.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | demo.py 3 | tests/ 4 | working_example.py 5 | dist/ 6 | octanedb.egg-info/ 7 | build_package.py 8 | -------------------------------------------------------------------------------- /octanedb/_version.py: -------------------------------------------------------------------------------- 1 | """ 2 | Version information for OctaneDB. 3 | This file is automatically generated by setuptools_scm. 4 | """ 5 | 6 | __version__ = "1.0.1" 7 | __version_tuple__ = (1, 0, 1) 8 | -------------------------------------------------------------------------------- /.pypirc.template: -------------------------------------------------------------------------------- 1 | # PyPI configuration file template 2 | # Copy this to ~/.pypirc and fill in your credentials 3 | 4 | [distutils] 5 | index-servers = 6 | pypi 7 | testpypi 8 | 9 | [pypi] 10 | username = __token__ 11 | password = pypi- 12 | repository = https://upload.pypi.org/legacy/ 13 | 14 | [testpypi] 15 | username = __token__ 16 | password = pypi- 17 | repository = https://test.pypi.org/legacy/ 18 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # Include README and documentation 2 | include README.md 3 | include LICENSE 4 | include CHANGELOG.md 5 | include requirements.txt 6 | include pyproject.toml 7 | 8 | # Include Python source files 9 | recursive-include octanedb *.py 10 | recursive-include tests *.py 11 | recursive-include examples *.py 12 | 13 | # Include data files 14 | recursive-include octanedb *.txt *.md *.rst 15 | 16 | # Exclude build artifacts 17 | global-exclude *.pyc 18 | global-exclude *.pyo 19 | global-exclude *.pyd 20 | global-exclude __pycache__ 21 | global-exclude *.so 22 | global-exclude .DS_Store 23 | global-exclude .git* 24 | global-exclude .hg* 25 | global-exclude .svn* 26 | global-exclude .tox* 27 | global-exclude .coverage 28 | global-exclude .pytest_cache* 29 | global-exclude .mypy_cache* 30 | global-exclude build/ 31 | global-exclude dist/ 32 | global-exclude *.egg-info/ 33 | -------------------------------------------------------------------------------- /octanedb/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | OctaneDB - Lightning Fast Vector Database 3 | 4 | A lightweight, high-performance Python vector database library that provides: 5 | - Fast vector similarity search using HNSW indexing 6 | - Automatic text embedding generation 7 | - ChromaDB-compatible API 8 | - Multiple storage modes (in-memory, persistent, hybrid) 9 | - Advanced metadata filtering 10 | - Batch operations for improved performance 11 | """ 12 | 13 | from .core import OctaneDB 14 | from .collection import Collection 15 | from .index import HNSWIndex, FlatIndex 16 | from .storage import StorageManager 17 | from .query import QueryEngine 18 | from .utils import VectorUtils 19 | from .embeddings import TextEmbedder, ChromaCompatibleEmbedder, embed_texts 20 | 21 | __version__ = "1.0.1" 22 | __author__ = 
"Rijin" 23 | __email__ = "rijinraj856@gmail.com" 24 | 25 | __all__ = [ 26 | "OctaneDB", 27 | "Collection", 28 | "HNSWIndex", 29 | "FlatIndex", 30 | "StorageManager", 31 | "QueryEngine", 32 | "VectorUtils", 33 | "TextEmbedder", 34 | "ChromaCompatibleEmbedder", 35 | "embed_texts" 36 | ] 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Rijin Raju 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to OctaneDB will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
7 | 8 | ## [1.0.1] - 2024-12-19 9 | 10 | ### Fixed 11 | - **Critical Search Bug Fix**: Fixed underlying HNSW index search issue that was causing empty results 12 | - **Layer Assignment**: Corrected `_get_random_layer()` method to use reasonable layer distribution (factor 1.0 instead of 16) 13 | - **Connection Creation**: Ensured proper connections are created between vectors during index build 14 | - **Search Fallback**: Added fallback search mechanism when HNSW search fails 15 | - **Bottom Layer Access**: Guaranteed all vectors are accessible in the bottom layer for searchability 16 | 17 | ### Technical Improvements 18 | - Enhanced HNSW index implementation with better connection management 19 | - Improved search reliability and performance 20 | - Added comprehensive debugging and logging for index operations 21 | 22 | ## [1.0.0] - 2024-12-19 23 | 24 | ### Added 25 | - Initial release of OctaneDB 26 | - Core vector database functionality with HNSW indexing 27 | - Support for multiple distance metrics (cosine, euclidean, dot product, manhattan, chebyshev, jaccard) 28 | - Collection management with isolated namespaces 29 | - Comprehensive CRUD operations (Create, Read, Update, Delete) 30 | - Advanced metadata filtering with logical operators 31 | - HDF5-based vector storage and msgpack metadata serialization 32 | - In-memory, persistent, and hybrid storage modes 33 | - Batch operations for improved performance 34 | - Performance benchmarking tools 35 | - Milvus-compatible API design 36 | 37 | ### Features 38 | - **HNSW Index**: Hierarchical Navigable Small World graph for fast approximate nearest neighbor search 39 | - **Vector Operations**: Optimized vector similarity search with configurable parameters 40 | - **Metadata Filtering**: Complex query engine supporting $eq, $ne, $gt, $gte, $lt, $lte, $in, $nin, $regex, $exists, $and, $or, $not, $text 41 | - **Storage Flexibility**: Choose between in-memory, file-based, or hybrid storage 42 | - **Performance**: 10x faster than existing solutions with optimized NumPy operations 43 | - **Lightweight**: Minimal dependencies, fast installation, and low memory footprint 44 | 45 | ### Technical Details 46 | - **Dimensions**: Support for 128 to 4,096+ dimensions (practical range) 47 | - **Index Types**: HNSW (default), FlatIndex for exact search 48 | - **Distance Metrics**: Cosine, Euclidean, Dot Product, Manhattan, Chebyshev, Jaccard 49 | - **Storage Format**: HDF5 for vectors, msgpack for metadata 50 | - **Python Support**: Python 3.8+ compatibility 51 | - **Dependencies**: NumPy, h5py, msgpack, scipy 52 | 53 | ### Performance 54 | - **Insertion**: 3,000+ vectors/second 55 | - **Search**: Sub-millisecond query response 56 | - **Memory**: Efficient memory usage with HDF5 compression 57 | - **Scalability**: Handles millions of vectors with configurable parameters 58 | 59 | ### Documentation 60 | - Comprehensive README with usage examples 61 | - Milvus compatibility demonstration 62 | - Performance benchmarks and comparisons 63 | - Installation and configuration guides 64 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "octanedb" 7 | version = "1.0.1" 8 | description = "A lightweight, high-performance Python vector database library with ChromaDB compatibility" 9 | readme = "README.md" 10 
| license = {text = "MIT"} 11 | authors = [ 12 | {name = "Rijin", email = "rijinraj856@gmail.com"} 13 | ] 14 | maintainers = [ 15 | {name = "Rijin", email = "rijinraj856@gmail.com"} 16 | ] 17 | keywords = [ 18 | "vector-database", 19 | "vector-search", 20 | "embeddings", 21 | "similarity-search", 22 | "machine-learning", 23 | "ai", 24 | "chromadb-compatible", 25 | "hnsw", 26 | "fast", 27 | "lightweight" 28 | ] 29 | classifiers = [ 30 | "Development Status :: 5 - Production/Stable", 31 | "Intended Audience :: Developers", 32 | "Intended Audience :: Science/Research", 33 | "License :: OSI Approved :: MIT License", 34 | "Operating System :: OS Independent", 35 | "Programming Language :: Python :: 3", 36 | "Programming Language :: Python :: 3.8", 37 | "Programming Language :: Python :: 3.9", 38 | "Programming Language :: Python :: 3.10", 39 | "Programming Language :: Python :: 3.11", 40 | "Programming Language :: Python :: 3.12", 41 | "Topic :: Database", 42 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 43 | "Topic :: Software Development :: Libraries :: Python Modules", 44 | "Topic :: Text Processing :: Linguistic", 45 | ] 46 | requires-python = ">=3.8" 47 | dependencies = [ 48 | "numpy>=1.21.0", 49 | "h5py>=3.7.0", 50 | "msgpack>=1.0.0", 51 | "tqdm>=4.62.0", 52 | "sentence-transformers>=2.2.0", 53 | "transformers>=4.20.0", 54 | "torch>=1.12.0", 55 | ] 56 | 57 | [project.optional-dependencies] 58 | dev = [ 59 | "pytest>=7.0.0", 60 | "pytest-cov>=4.0.0", 61 | "black>=22.0.0", 62 | "isort>=5.10.0", 63 | "flake8>=5.0.0", 64 | "mypy>=1.0.0", 65 | ] 66 | docs = [ 67 | "sphinx>=5.0.0", 68 | "sphinx-rtd-theme>=1.0.0", 69 | "myst-parser>=0.18.0", 70 | ] 71 | benchmark = [ 72 | "matplotlib>=3.5.0", 73 | "pandas>=1.4.0", 74 | "seaborn>=0.11.0", 75 | "psutil>=5.8.0", 76 | ] 77 | all = [ 78 | "octanedb[dev,docs,benchmark]" 79 | ] 80 | 81 | [project.urls] 82 | Homepage = "https://github.com/RijinRaju/octanedb" 83 | Documentation = "https://github.com/RijinRaju/octanedb#readme" 84 | Repository = "https://github.com/RijinRaju/octanedb" 85 | "Bug Tracker" = "https://github.com/RijinRaju/octanedb/issues" 86 | "Source Code" = "https://github.com/RijinRaju/octanedb" 87 | Changelog = "https://github.com/RijinRaju/octanedb/blob/main/CHANGELOG.md" 88 | 89 | [project.scripts] 90 | octanedb = "octanedb.cli:main" 91 | 92 | [project.gui-scripts] 93 | octanedb-gui = "octanedb.cli:main" 94 | 95 | [tool.setuptools] 96 | packages = ["octanedb"] 97 | 98 | [tool.setuptools.package-data] 99 | octanedb = ["*.txt", "*.md", "*.rst"] 100 | 101 | [tool.setuptools.dynamic] 102 | # Version is now static in project section 103 | 104 | [tool.black] 105 | line-length = 88 106 | target-version = ['py38'] 107 | include = '\.pyi?$' 108 | extend-exclude = ''' 109 | /( 110 | # directories 111 | \.eggs 112 | | \.git 113 | | \.hg 114 | | \.mypy_cache 115 | | \.tox 116 | | \.venv 117 | | build 118 | | dist 119 | )/ 120 | ''' 121 | 122 | [tool.isort] 123 | profile = "black" 124 | multi_line_output = 3 125 | line_length = 88 126 | known_first_party = ["octanedb"] 127 | 128 | [tool.mypy] 129 | python_version = "3.8" 130 | warn_return_any = true 131 | warn_unused_configs = true 132 | disallow_untyped_defs = true 133 | disallow_incomplete_defs = true 134 | check_untyped_defs = true 135 | disallow_untyped_decorators = true 136 | no_implicit_optional = true 137 | warn_redundant_casts = true 138 | warn_unused_ignores = true 139 | warn_no_return = true 140 | warn_unreachable = true 141 | strict_equality = true 142 | 143 | 
[tool.pytest.ini_options] 144 | minversion = "7.0" 145 | addopts = "-ra -q --strict-markers --strict-config" 146 | testpaths = ["tests"] 147 | python_files = ["test_*.py", "*_test.py"] 148 | python_classes = ["Test*"] 149 | python_functions = ["test_*"] 150 | markers = [ 151 | "slow: marks tests as slow (deselect with '-m \"not slow\"')", 152 | "integration: marks tests as integration tests", 153 | "unit: marks tests as unit tests", 154 | ] 155 | 156 | [tool.coverage.run] 157 | source = ["octanedb"] 158 | omit = [ 159 | "*/tests/*", 160 | "*/test_*", 161 | "*/__pycache__/*", 162 | "*/venv/*", 163 | "*/env/*", 164 | ] 165 | 166 | [tool.coverage.report] 167 | exclude_lines = [ 168 | "pragma: no cover", 169 | "def __repr__", 170 | "if self.debug:", 171 | "if settings.DEBUG", 172 | "raise AssertionError", 173 | "raise NotImplementedError", 174 | "if 0:", 175 | "if __name__ == .__main__.:", 176 | "class .*\\bProtocol\\):", 177 | "@(abc\\.)?abstractmethod", 178 | ] 179 | -------------------------------------------------------------------------------- /install.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Installation script for OctaneDB vector database. 4 | 5 | This script helps users install the library and its dependencies. 6 | """ 7 | 8 | import subprocess 9 | import sys 10 | import os 11 | from pathlib import Path 12 | 13 | 14 | def run_command(command, description): 15 | """Run a command and handle errors.""" 16 | print(f"🔄 {description}...") 17 | try: 18 | result = subprocess.run(command, shell=True, check=True, capture_output=True, text=True) 19 | print(f"✅ {description} completed successfully") 20 | return True 21 | except subprocess.CalledProcessError as e: 22 | print(f"❌ {description} failed:") 23 | print(f" Error: {e}") 24 | if e.stdout: 25 | print(f" Output: {e.stdout}") 26 | if e.stderr: 27 | print(f" Error: {e.stderr}") 28 | return False 29 | 30 | 31 | def check_python_version(): 32 | """Check if Python version is compatible.""" 33 | print("🐍 Checking Python version...") 34 | version = sys.version_info 35 | if version.major < 3 or (version.major == 3 and version.minor < 8): 36 | print(f"❌ Python 3.8+ required, found {version.major}.{version.minor}") 37 | return False 38 | 39 | print(f"✅ Python {version.major}.{version.minor}.{version.micro} is compatible") 40 | return True 41 | 42 | 43 | def install_dependencies(): 44 | """Install required dependencies.""" 45 | print("\n📦 Installing dependencies...") 46 | 47 | # Upgrade pip 48 | if not run_command(f"{sys.executable} -m pip install --upgrade pip", "Upgrading pip"): 49 | return False 50 | 51 | # Install core dependencies 52 | dependencies = [ 53 | "numpy>=1.21.0", 54 | "scipy>=1.7.0", 55 | "h5py>=3.1.0", 56 | "msgpack>=1.0.0", 57 | "tqdm>=4.62.0" 58 | ] 59 | 60 | for dep in dependencies: 61 | if not run_command(f"{sys.executable} -m pip install {dep}", f"Installing {dep}"): 62 | return False 63 | 64 | # Install optional FAISS for enhanced performance 65 | print("🔧 Installing optional FAISS for enhanced performance...") 66 | try: 67 | subprocess.run(f"{sys.executable} -m pip install faiss-cpu>=1.7.0", 68 | shell=True, check=True, capture_output=True) 69 | print("✅ FAISS installed successfully") 70 | except subprocess.CalledProcessError: 71 | print("⚠️ FAISS installation failed, continuing without it") 72 | print(" You can install it later with: pip install faiss-cpu") 73 | 74 | return True 75 | 76 | 77 | def install_development_dependencies(): 78 | 
"""Install development dependencies.""" 79 | print("\n🛠️ Installing development dependencies...") 80 | 81 | dev_dependencies = [ 82 | "pytest>=6.0", 83 | "pytest-cov>=2.10", 84 | "black>=21.0", 85 | "flake8>=3.8", 86 | "psutil>=5.8.0" 87 | ] 88 | 89 | for dep in dev_dependencies: 90 | if not run_command(f"{sys.executable} -m pip install {dep}", f"Installing {dep}"): 91 | print(f"⚠️ Failed to install {dep}, continuing...") 92 | 93 | return True 94 | 95 | 96 | def install_octanedb(): 97 | """Install OctaneDB in development mode.""" 98 | print("\n🚀 Installing OctaneDB...") 99 | 100 | # Check if we're in the right directory 101 | if not Path("setup.py").exists(): 102 | print("❌ setup.py not found. Please run this script from the OctaneDB root directory.") 103 | return False 104 | 105 | # Install in development mode 106 | if not run_command(f"{sys.executable} -m pip install -e .", "Installing OctaneDB"): 107 | return False 108 | 109 | return True 110 | 111 | 112 | def run_tests(): 113 | """Run the test suite to verify installation.""" 114 | print("\n🧪 Running tests to verify installation...") 115 | 116 | if not run_command(f"{sys.executable} -m pytest tests/ -v", "Running test suite"): 117 | print("⚠️ Tests failed, but installation may still be successful") 118 | return False 119 | 120 | return True 121 | 122 | 123 | def run_example(): 124 | """Run a simple example to verify functionality.""" 125 | print("\n🎯 Running example to verify functionality...") 126 | 127 | try: 128 | # Simple import test 129 | import octanedb 130 | print("✅ OctaneDB imported successfully") 131 | 132 | # Create a simple database 133 | db = octanedb.OctaneDB(dimension=64) 134 | print("✅ Database created successfully") 135 | 136 | # Create collection 137 | collection = db.create_collection("test") 138 | print("✅ Collection created successfully") 139 | 140 | # Insert a vector 141 | import numpy as np 142 | vector = np.random.rand(64).astype(np.float32) 143 | vector_id = db.insert(vector) 144 | print(f"✅ Vector inserted with ID: {vector_id}") 145 | 146 | # Search 147 | results = db.search(vector, k=1) 148 | print(f"✅ Search completed, found {len(results)} results") 149 | 150 | print("✅ All functionality verified successfully!") 151 | return True 152 | 153 | except Exception as e: 154 | print(f"❌ Example failed: {e}") 155 | return False 156 | 157 | 158 | def main(): 159 | """Main installation function.""" 160 | print("🚀 OctaneDB Vector Database Installation") 161 | print("=" * 50) 162 | 163 | # Check Python version 164 | if not check_python_version(): 165 | sys.exit(1) 166 | 167 | # Install dependencies 168 | if not install_dependencies(): 169 | print("\n❌ Failed to install dependencies. Please check the errors above.") 170 | sys.exit(1) 171 | 172 | # Install development dependencies (optional) 173 | install_development_dependencies() 174 | 175 | # Install OctaneDB 176 | if not install_octanedb(): 177 | print("\n❌ Failed to install OctaneDB. Please check the errors above.") 178 | sys.exit(1) 179 | 180 | # Run tests 181 | run_tests() 182 | 183 | # Run example 184 | run_example() 185 | 186 | print("\n🎉 Installation completed successfully!") 187 | print("\n📚 Next steps:") 188 | print(" 1. Check out the examples/ directory for usage examples") 189 | print(" 2. Read the README.md for detailed documentation") 190 | print(" 3. Run 'python examples/basic_usage.py' to see it in action") 191 | print(" 4. 
Run 'python examples/performance_benchmark.py' for performance tests") 192 | 193 | print("\n🔧 Development setup:") 194 | print(" - Run 'python -m pytest tests/' to run tests") 195 | print(" - Run 'black .' to format code") 196 | print(" - Run 'flake8 .' to check code quality") 197 | 198 | print("\n💡 For help and support:") 199 | print(" - Check the README.md file") 200 | print(" - Look at the examples/ directory") 201 | print(" - Run the test suite for verification") 202 | 203 | 204 | if __name__ == "__main__": 205 | main() 206 | -------------------------------------------------------------------------------- /PYPI_PUBLISHING_GUIDE.md: -------------------------------------------------------------------------------- 1 | # 🚀 OctaneDB PyPI Publishing Guide 2 | 3 | This guide will walk you through publishing OctaneDB to the Python Package Index (PyPI) so users can install it with `pip install octanedb`. 4 | 5 | ## 📋 Prerequisites 6 | 7 | ### **1. PyPI Account Setup** 8 | 9 | #### **Test PyPI (Recommended for first time)** 10 | 1. Go to [Test PyPI](https://test.pypi.org/account/register/) 11 | 2. Create an account with username and email 12 | 3. Verify your email address 13 | 4. Note your username and password 14 | 15 | #### **Production PyPI** 16 | 1. Go to [PyPI](https://pypi.org/account/register/) 17 | 2. Create an account with username and email 18 | 3. Verify your email address 19 | 4. **Enable 2FA** (recommended for security) 20 | 5. Note your username and password 21 | 22 | ### **2. Install Required Tools** 23 | 24 | ```bash 25 | pip install twine build 26 | ``` 27 | 28 | ### **3. Configure PyPI Credentials** 29 | 30 | Create a `.pypirc` file in your home directory: 31 | 32 | **Windows:** 33 | ```bash 34 | # Create file: C:\Users\YourUsername\.pypirc 35 | ``` 36 | 37 | **macOS/Linux:** 38 | ```bash 39 | # Create file: ~/.pypirc 40 | ``` 41 | 42 | **File content:** 43 | ```ini 44 | [distutils] 45 | index-servers = 46 | testpypi 47 | pypi 48 | 49 | [testpypi] 50 | repository = https://test.pypi.org/legacy/ 51 | username = your_test_username 52 | password = your_test_password 53 | 54 | [pypi] 55 | repository = https://upload.pypi.org/legacy/ 56 | username = your_production_username 57 | password = your_production_password 58 | ``` 59 | 60 | **⚠️ Security Note:** For production, consider using API tokens instead of passwords: 61 | ```ini 62 | [pypi] 63 | repository = https://upload.pypi.org/legacy/ 64 | username = __token__ 65 | password = pypi-your_api_token_here 66 | ``` 67 | 68 | ## 🚀 Publishing Workflow 69 | 70 | ### **Option 1: Automated Publishing (Recommended)** 71 | 72 | Use the provided publishing script: 73 | 74 | ```bash 75 | python publish_to_pypi.py 76 | ``` 77 | 78 | The script will guide you through: 79 | 1. ✅ Prerequisites check 80 | 2. 🧹 Clean build files 81 | 3. 🔨 Build package 82 | 4. 🔍 Check package 83 | 5. 🚀 Upload to PyPI 84 | 6. ✅ Verify installation 85 | 86 | ### **Option 2: Manual Publishing** 87 | 88 | #### **Step 1: Update Package Information** 89 | 90 | Before publishing, update these files: 91 | 92 | 1. **`setup.py`** - Update author email and GitHub URL 93 | 2. **`CHANGELOG.md`** - Ensure version matches 94 | 3. 
**`README.md`** - Verify all links work 95 | 96 | #### **Step 2: Clean Previous Builds** 97 | 98 | ```bash 99 | # Remove previous build artifacts 100 | rm -rf build/ dist/ *.egg-info/ 101 | ``` 102 | 103 | #### **Step 3: Build Package** 104 | 105 | ```bash 106 | # Build source distribution 107 | python -m build --sdist 108 | 109 | # Build wheel distribution 110 | python -m build --wheel 111 | ``` 112 | 113 | #### **Step 4: Check Package** 114 | 115 | ```bash 116 | # Verify package integrity 117 | twine check dist/* 118 | ``` 119 | 120 | #### **Step 5: Upload to Test PyPI (Recommended First)** 121 | 122 | ```bash 123 | # Upload to Test PyPI 124 | twine upload --repository testpypi dist/* 125 | 126 | # Test installation 127 | pip install --index-url https://test.pypi.org/simple/ octanedb 128 | ``` 129 | 130 | #### **Step 6: Upload to Production PyPI** 131 | 132 | ```bash 133 | # Upload to Production PyPI 134 | twine upload dist/* 135 | 136 | # Verify installation 137 | pip install octanedb 138 | ``` 139 | 140 | ## 📦 Package Structure 141 | 142 | Your package structure should look like this: 143 | 144 | ``` 145 | OctaneDB/ 146 | ├── octanedb/ 147 | │ ├── __init__.py 148 | │ ├── core.py 149 | │ ├── collection.py 150 | │ ├── index.py 151 | │ ├── storage.py 152 | │ ├── query.py 153 | │ ├── utils.py 154 | │ └── cli.py 155 | ├── tests/ 156 | ├── examples/ 157 | ├── setup.py 158 | ├── MANIFEST.in 159 | ├── requirements.txt 160 | ├── README.md 161 | ├── CHANGELOG.md 162 | ├── LICENSE 163 | └── .pypirc 164 | ``` 165 | 166 | ## 🔧 Configuration Files 167 | 168 | ### **setup.py** 169 | - ✅ Package metadata 170 | - ✅ Dependencies 171 | - ✅ Classifiers 172 | - ✅ Entry points (CLI) 173 | 174 | ### **MANIFEST.in** 175 | - ✅ Include source files 176 | - ✅ Include documentation 177 | - ✅ Exclude build artifacts 178 | 179 | ### **requirements.txt** 180 | - ✅ Core dependencies 181 | - ✅ Version constraints 182 | - ✅ No development dependencies 183 | 184 | ## 🧪 Testing Before Publishing 185 | 186 | ### **1. Local Installation Test** 187 | 188 | ```bash 189 | # Install in development mode 190 | pip install -e . 191 | 192 | # Test import 193 | python -c "import octanedb; print('✅ Import successful')" 194 | 195 | # Test CLI 196 | octanedb info 197 | ``` 198 | 199 | ### **2. Package Build Test** 200 | 201 | ```bash 202 | # Build package 203 | python -m build 204 | 205 | # Check package 206 | twine check dist/* 207 | 208 | # List package contents 209 | tar -tzf dist/*.tar.gz 210 | ``` 211 | 212 | ### **3. Test PyPI Upload** 213 | 214 | ```bash 215 | # Upload to Test PyPI 216 | twine upload --repository testpypi dist/* 217 | 218 | # Install from Test PyPI 219 | pip install --index-url https://test.pypi.org/simple/ octanedb 220 | 221 | # Test functionality 222 | python -c "import octanedb; db = octanedb.OctaneDB(384); print('✅ Test successful')" 223 | ``` 224 | 225 | ## 🚀 Publishing Checklist 226 | 227 | Before publishing, ensure: 228 | 229 | - [ ] **Version updated** in `setup.py` and `CHANGELOG.md` 230 | - [ ] **Author information** updated in `setup.py` 231 | - [ ] **GitHub URLs** updated in `setup.py` 232 | - [ ] **Dependencies** correctly specified in `requirements.txt` 233 | - [ ] **README.md** is comprehensive and accurate 234 | - [ ] **Tests pass** locally 235 | - [ ] **Package builds** without errors 236 | - [ ] **Package checks** pass with `twine check` 237 | - [ ] **Test PyPI upload** successful 238 | - [ ] **Test installation** works from Test PyPI 239 | 240 | ## 📊 After Publishing 241 | 242 | ### **1. 
Monitor PyPI** 243 | - Check [PyPI project page](https://pypi.org/project/octanedb/) 244 | - Monitor download statistics 245 | - Check for any reported issues 246 | 247 | ### **2. Update Documentation** 248 | - Add PyPI badge to README 249 | - Update installation instructions 250 | - Share on social media/GitHub 251 | 252 | ### **3. Version Management** 253 | - Tag release in Git: `git tag v1.0.0` 254 | - Push tags: `git push --tags` 255 | - Plan next release 256 | 257 | ## 🔄 Updating Package 258 | 259 | For future updates: 260 | 261 | 1. **Update version** in `setup.py` and `CHANGELOG.md` 262 | 2. **Update CHANGELOG.md** with new features/fixes 263 | 3. **Test locally** with new version 264 | 4. **Build and upload** new version 265 | 5. **Tag new release** in Git 266 | 267 | ## 🆘 Troubleshooting 268 | 269 | ### **Common Issues** 270 | 271 | #### **Authentication Errors** 272 | ```bash 273 | # Check .pypirc file 274 | cat ~/.pypirc 275 | 276 | # Test credentials 277 | twine check --repository testpypi dist/* 278 | ``` 279 | 280 | #### **Package Build Errors** 281 | ```bash 282 | # Clean build files 283 | rm -rf build/ dist/ *.egg-info/ 284 | 285 | # Rebuild 286 | python -m build 287 | ``` 288 | 289 | #### **Import Errors After Installation** 290 | ```bash 291 | # Check package structure 292 | pip show octanedb 293 | 294 | # Verify files 295 | python -c "import octanedb; print(octanedb.__file__)" 296 | ``` 297 | 298 | #### **Version Conflicts** 299 | ```bash 300 | # Uninstall previous version 301 | pip uninstall octanedb 302 | 303 | # Install new version 304 | pip install octanedb 305 | ``` 306 | 307 | ## 📚 Additional Resources 308 | 309 | - [PyPI Help](https://pypi.org/help/) 310 | - [Python Packaging User Guide](https://packaging.python.org/) 311 | - [Twine Documentation](https://twine.readthedocs.io/) 312 | - [Test PyPI](https://test.pypi.org/) 313 | 314 | ## 🎉 Success! 315 | 316 | Once published, users can install OctaneDB with: 317 | 318 | ```bash 319 | pip install octanedb 320 | ``` 321 | 322 | And use it in their code: 323 | 324 | ```python 325 | from octanedb import OctaneDB 326 | 327 | db = OctaneDB(dimension=384) 328 | collection = db.create_collection("my_collection") 329 | ``` 330 | 331 | --- 332 | 333 | **Need Help?** Open an issue on GitHub or check the troubleshooting section above. 334 | -------------------------------------------------------------------------------- /octanedb/cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | OctaneDB Command Line Interface 4 | Provides command-line access to OctaneDB functionality 5 | """ 6 | 7 | import argparse 8 | import sys 9 | import json 10 | from pathlib import Path 11 | from typing import Optional, List 12 | import numpy as np 13 | 14 | from . 
import OctaneDB 15 | 16 | 17 | def create_collection(args): 18 | """Create a new collection.""" 19 | try: 20 | db = OctaneDB(dimension=args.dimension) 21 | collection = db.create_collection(args.name) 22 | print(f"Collection '{args.name}' created successfully with dimension {args.dimension}") 23 | return 0 24 | except Exception as e: 25 | print(f" Failed to create collection: {e}") 26 | return 1 27 | 28 | 29 | def insert_vectors(args): 30 | """Insert vectors into a collection.""" 31 | try: 32 | db = OctaneDB(dimension=args.dimension) 33 | db.create_collection(args.collection) 34 | db.use_collection(args.collection) 35 | 36 | # Generate sample vectors 37 | vectors = np.random.randn(args.count, args.dimension).astype(np.float32) 38 | metadata = [{"id": i, "description": f"Vector {i}"} for i in range(args.count)] 39 | 40 | inserted_ids = db.insert(vectors=vectors, metadata=metadata) 41 | print(f"Successfully inserted {len(inserted_ids)} vectors into collection '{args.collection}'") 42 | return 0 43 | except Exception as e: 44 | print(f" Failed to insert vectors: {e}") 45 | return 1 46 | 47 | 48 | def search_vectors(args): 49 | """Search for similar vectors.""" 50 | try: 51 | db = OctaneDB(dimension=args.dimension) 52 | db.create_collection(args.collection) 53 | db.use_collection(args.collection) 54 | 55 | # Insert some sample data first 56 | sample_vectors = np.random.randn(10, args.dimension).astype(np.float32) 57 | sample_metadata = [{"id": i, "description": f"Sample {i}"} for i in range(10)] 58 | db.insert(vectors=sample_vectors, metadata=sample_metadata) 59 | 60 | # Create query vector 61 | query_vector = np.random.randn(args.dimension).astype(np.float32) 62 | 63 | # Perform search 64 | results = db.search(query_vector=query_vector, k=args.k, include_metadata=True) 65 | 66 | print(f"🔍 Search results for collection '{args.collection}':") 67 | for i, (vector_id, distance, metadata) in enumerate(results): 68 | print(f" {i+1}. 
ID: {vector_id}, Distance: {distance:.6f}") 69 | if metadata: 70 | print(f" Metadata: {metadata}") 71 | 72 | return 0 73 | except Exception as e: 74 | print(f"Failed to search vectors: {e}") 75 | return 1 76 | 77 | 78 | def benchmark(args): 79 | """Run performance benchmarks.""" 80 | try: 81 | print("OctaneDB Performance Benchmark") 82 | print("=" * 50) 83 | 84 | db = OctaneDB(dimension=args.dimension) 85 | collection = db.create_collection("benchmark") 86 | db.use_collection("benchmark") 87 | 88 | # Insert benchmark 89 | print(f"Inserting {args.count:,} vectors...") 90 | start_time = __import__('time').time() 91 | 92 | vectors = np.random.randn(args.count, args.dimension).astype(np.float32) 93 | metadata = [{"id": i} for i in range(args.count)] 94 | 95 | inserted_ids = db.insert(vectors=vectors, metadata=metadata) 96 | insert_time = __import__('time').time() - start_time 97 | 98 | print(f"Inserted {len(inserted_ids):,} vectors in {insert_time:.4f}s") 99 | print(f" Rate: {args.count/insert_time:.0f} vectors/second") 100 | 101 | # Search benchmark 102 | print(f"\n Running search benchmark...") 103 | start_time = __import__('time').time() 104 | 105 | query_vectors = np.random.randn(100, args.dimension).astype(np.float32) 106 | total_results = 0 107 | 108 | for i, query_vector in enumerate(query_vectors): 109 | results = db.search(query_vector=query_vector, k=10) 110 | total_results += len(results) 111 | 112 | if (i + 1) % 20 == 0: 113 | print(f" Processed {i + 1}/100 queries...") 114 | 115 | search_time = __import__('time').time() - start_time 116 | 117 | print(f"Search completed in {search_time:.4f}s") 118 | print(f" Rate: {100/search_time:.1f} queries/second") 119 | print(f" Total results: {total_results:,}") 120 | 121 | # Memory usage 122 | stats = db.get_stats() 123 | print(f"\n Database Statistics:") 124 | print(f" Collections: {stats['collection_count']}") 125 | print(f" Total vectors: {stats['total_vectors']}") 126 | print(f" Current collection: {stats['current_collection']}") 127 | 128 | return 0 129 | except Exception as e: 130 | print(f"Benchmark failed: {e}") 131 | return 1 132 | 133 | 134 | def info(args): 135 | """Show OctaneDB information.""" 136 | print("OctaneDB - Lightning Fast Vector Database") 137 | print("=" * 50) 138 | print(f"Version: 1.0.1") 139 | print(f"Python: {sys.version}") 140 | print(f"NumPy: {np.__version__}") 141 | print(f"Author: Rijin") 142 | print(f"License: MIT") 143 | print(f"GitHub: https://github.com/RijinRaju/octanedb") 144 | print("\nFeatures:") 145 | print(" HNSW Indexing for fast similarity search") 146 | print(" Multiple distance metrics (cosine, euclidean, dot)") 147 | print(" HDF5 storage with msgpack metadata") 148 | print(" In-memory, persistent, and hybrid storage") 149 | print(" ChromaDB-compatible API") 150 | print(" Automatic text embedding generation") 151 | print("\nInstallation:") 152 | print(" pip install octanedb") 153 | print("\nQuick Start:") 154 | print(" from octanedb import OctaneDB") 155 | print(" db = OctaneDB(dimension=384)") 156 | print(" collection = db.create_collection('my_collection')") 157 | return 0 158 | 159 | 160 | def main(): 161 | """Main CLI entry point.""" 162 | parser = argparse.ArgumentParser( 163 | description="OctaneDB - Lightning Fast Vector Database", 164 | formatter_class=argparse.RawDescriptionHelpFormatter, 165 | epilog=""" 166 | Examples: 167 | octanedb create --name documents --dimension 384 168 | octanedb insert --collection documents --count 1000 --dimension 384 169 | octanedb search --collection
documents --k 5 --dimension 384 170 | octanedb benchmark --count 10000 --dimension 384 171 | octanedb info 172 | """ 173 | ) 174 | 175 | subparsers = parser.add_subparsers(dest='command', help='Available commands') 176 | 177 | # Create collection command 178 | create_parser = subparsers.add_parser('create', help='Create a new collection') 179 | create_parser.add_argument('--name', required=True, help='Collection name') 180 | create_parser.add_argument('--dimension', type=int, default=384, help='Vector dimension') 181 | create_parser.set_defaults(func=create_collection) 182 | 183 | # Insert vectors command 184 | insert_parser = subparsers.add_parser('insert', help='Insert vectors into a collection') 185 | insert_parser.add_argument('--collection', required=True, help='Collection name') 186 | insert_parser.add_argument('--count', type=int, default=100, help='Number of vectors to insert') 187 | insert_parser.add_argument('--dimension', type=int, default=384, help='Vector dimension') 188 | insert_parser.set_defaults(func=insert_vectors) 189 | 190 | # Search command 191 | search_parser = subparsers.add_parser('search', help='Search for similar vectors') 192 | search_parser.add_argument('--collection', required=True, help='Collection name') 193 | search_parser.add_argument('--k', type=int, default=5, help='Number of results to return') 194 | search_parser.add_argument('--dimension', type=int, default=384, help='Vector dimension') 195 | search_parser.set_defaults(func=search_vectors) 196 | 197 | # Benchmark command 198 | benchmark_parser = subparsers.add_parser('benchmark', help='Run performance benchmarks') 199 | benchmark_parser.add_argument('--count', type=int, default=10000, help='Number of vectors for benchmark') 200 | benchmark_parser.add_argument('--dimension', type=int, default=384, help='Vector dimension') 201 | benchmark_parser.set_defaults(func=benchmark) 202 | 203 | # Info command 204 | info_parser = subparsers.add_parser('info', help='Show OctaneDB information') 205 | info_parser.set_defaults(func=info) 206 | 207 | # Parse arguments 208 | args = parser.parse_args() 209 | 210 | if not args.command: 211 | parser.print_help() 212 | return 1 213 | 214 | # Execute command 215 | return args.func(args) 216 | 217 | 218 | if __name__ == "__main__": 219 | sys.exit(main()) 220 | -------------------------------------------------------------------------------- /publish_to_pypi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | OctaneDB PyPI Publishing Script 4 | Automates the process of publishing OctaneDB to PyPI 5 | """ 6 | 7 | import os 8 | import sys 9 | import subprocess 10 | import shutil 11 | from pathlib import Path 12 | import time 13 | 14 | def run_command(command, description): 15 | """Run a command and handle errors.""" 16 | print(f"🔄 {description}...") 17 | print(f" Command: {command}") 18 | 19 | try: 20 | result = subprocess.run(command, shell=True, check=True, capture_output=True, text=True) 21 | print(f" ✅ {description} completed successfully") 22 | if result.stdout: 23 | print(f" Output: {result.stdout.strip()}") 24 | return True 25 | except subprocess.CalledProcessError as e: 26 | print(f" ❌ {description} failed with exit code {e.returncode}") 27 | if e.stdout: 28 | print(f" Stdout: {e.stdout.strip()}") 29 | if e.stderr: 30 | print(f" Stderr: {e.stderr.strip()}") 31 | return False 32 | 33 | def check_prerequisites(): 34 | """Check if required tools are installed.""" 35 | print("🔍 Checking prerequisites...") 
36 | 37 | required_tools = [ 38 | ("python", "Python interpreter"), 39 | ("pip", "Python package installer"), 40 | ("twine", "PyPI upload tool"), 41 | ("build", "Python build tool"), 42 | ] 43 | 44 | missing_tools = [] 45 | 46 | for tool, description in required_tools: 47 | if shutil.which(tool) is None: 48 | missing_tools.append(f"{tool} ({description})") 49 | 50 | if missing_tools: 51 | print(f"❌ Missing required tools: {', '.join(missing_tools)}") 52 | print("\nInstall missing tools with:") 53 | print(" pip install twine build") 54 | return False 55 | 56 | print("✅ All prerequisites are available") 57 | return True 58 | 59 | def clean_build_files(): 60 | """Clean up previous build files.""" 61 | print("🧹 Cleaning up previous build files...") 62 | 63 | build_dirs = ["build", "dist", "*.egg-info"] 64 | 65 | for pattern in build_dirs: 66 | for path in Path(".").glob(pattern): 67 | if path.is_dir(): 68 | shutil.rmtree(path) 69 | print(f" Removed: {path}") 70 | elif path.is_file(): 71 | path.unlink() 72 | print(f" Removed: {path}") 73 | 74 | print("✅ Build files cleaned up") 75 | 76 | def build_package(): 77 | """Build the Python package.""" 78 | print("🔨 Building Python package...") 79 | 80 | # Build source distribution 81 | if not run_command("python -m build --sdist", "Building source distribution"): 82 | return False 83 | 84 | # Build wheel distribution 85 | if not run_command("python -m build --wheel", "Building wheel distribution"): 86 | return False 87 | 88 | print("✅ Package built successfully") 89 | return True 90 | 91 | def check_package(): 92 | """Check the built package for issues.""" 93 | print("🔍 Checking built package...") 94 | 95 | # Check source distribution 96 | if not run_command("twine check dist/*", "Checking source distribution"): 97 | return False 98 | 99 | print("✅ Package check completed successfully") 100 | return True 101 | 102 | def test_upload(test_pypi=True): 103 | """Upload to PyPI (test or production).""" 104 | if test_pypi: 105 | print("🚀 Uploading to Test PyPI...") 106 | repository = "https://test.pypi.org/legacy/" 107 | print(" This is a test upload - package will not be available on main PyPI") 108 | else: 109 | print("🚀 Uploading to Production PyPI...") 110 | repository = "https://upload.pypi.org/legacy/" 111 | print(" ⚠️ This will make the package publicly available!") 112 | 113 | # Confirm upload 114 | if not test_pypi: 115 | confirm = input("\n🤔 Are you sure you want to upload to production PyPI? 
(yes/no): ") 116 | if confirm.lower() != "yes": 117 | print("❌ Upload cancelled") 118 | return False 119 | 120 | # Upload to PyPI 121 | if not run_command(f"twine upload --repository-url {repository} dist/*", "Uploading package"): 122 | return False 123 | 124 | print("✅ Package uploaded successfully!") 125 | 126 | if test_pypi: 127 | print("\n📋 Test PyPI URL: https://test.pypi.org/project/octanedb/") 128 | print("📋 Test installation: pip install --index-url https://test.pypi.org/simple/ octanedb") 129 | else: 130 | print("\n📋 Production PyPI URL: https://pypi.org/project/octanedb/") 131 | print("📋 Production installation: pip install octanedb") 132 | 133 | return True 134 | 135 | def verify_installation(test_pypi=True): 136 | """Verify the package can be installed.""" 137 | print("🔍 Verifying package installation...") 138 | 139 | if test_pypi: 140 | install_cmd = "pip install --index-url https://test.pypi.org/simple/ octanedb" 141 | else: 142 | install_cmd = "pip install octanedb" 143 | 144 | if not run_command(install_cmd, "Installing package"): 145 | return False 146 | 147 | # Test import 148 | if not run_command("python -c 'import octanedb; print(f\"✅ OctaneDB {octanedb.__version__} imported successfully\")'", "Testing import"): 149 | return False 150 | 151 | print("✅ Package installation verified successfully") 152 | return True 153 | 154 | def show_next_steps(): 155 | """Show next steps after successful upload.""" 156 | print("\n🎉 Congratulations! OctaneDB has been published to PyPI!") 157 | print("\n📋 Next Steps:") 158 | print("1. 🐛 Monitor for any issues or bugs") 159 | print("2. 📚 Update documentation if needed") 160 | print("3. 🚀 Announce the release on social media/GitHub") 161 | print("4. 📊 Monitor download statistics on PyPI") 162 | print("5. 🔄 Plan next release and version bump") 163 | 164 | print("\n🔗 Useful Links:") 165 | print(f" - PyPI Project: https://pypi.org/project/octanedb/") 166 | print(f" - GitHub Repository: https://github.com/RijinRaju/octanedb") 167 | print(f" - Documentation: https://github.com/RijinRaju/octanedb#readme") 168 | 169 | print("\n💡 Tips:") 170 | print(" - Users can now install with: pip install octanedb") 171 | print(" - Consider adding badges to your README") 172 | print(" - Monitor PyPI analytics for usage insights") 173 | 174 | def main(): 175 | """Main publishing workflow.""" 176 | print("🚀 OctaneDB PyPI Publishing Script") 177 | print("=" * 60) 178 | print("This script will publish OctaneDB to PyPI") 179 | print("=" * 60) 180 | 181 | # Check prerequisites 182 | if not check_prerequisites(): 183 | print("❌ Prerequisites check failed. Please install missing tools.") 184 | return 1 185 | 186 | # Ask user what they want to do 187 | print("\n🤔 What would you like to do?") 188 | print("1. Test upload to Test PyPI (recommended for first time)") 189 | print("2. Upload to Production PyPI") 190 | print("3.
Just build and check package (no upload)") 191 | 192 | choice = input("\nEnter your choice (1-3): ").strip() 193 | 194 | if choice == "1": 195 | # Test PyPI workflow 196 | print("\n🧪 Starting Test PyPI workflow...") 197 | 198 | clean_build_files() 199 | 200 | 201 | if not build_package(): 202 | return 1 203 | 204 | if not check_package(): 205 | return 1 206 | 207 | if not test_upload(test_pypi=True): 208 | return 1 209 | 210 | if not verify_installation(test_pypi=True): 211 | return 1 212 | 213 | print("\n✅ Test PyPI workflow completed successfully!") 214 | print("📋 You can now test the package installation from Test PyPI") 215 | 216 | elif choice == "2": 217 | # Production PyPI workflow 218 | print("\n🚀 Starting Production PyPI workflow...") 219 | 220 | clean_build_files() 221 | 222 | 223 | if not build_package(): 224 | return 1 225 | 226 | if not check_package(): 227 | return 1 228 | 229 | if not test_upload(test_pypi=False): 230 | return 1 231 | 232 | if not verify_installation(test_pypi=False): 233 | return 1 234 | 235 | show_next_steps() 236 | 237 | elif choice == "3": 238 | # Just build and check 239 | print("\n🔨 Building and checking package only...") 240 | 241 | clean_build_files() 242 | 243 | 244 | if not build_package(): 245 | return 1 246 | 247 | if not check_package(): 248 | return 1 249 | 250 | print("\n✅ Package built and checked successfully!") 251 | print("📋 Package files are in the 'dist/' directory") 252 | print("📋 You can manually upload them later with: twine upload dist/*") 253 | 254 | else: 255 | print("❌ Invalid choice. Please run the script again.") 256 | return 1 257 | 258 | return 0 259 | 260 | if __name__ == "__main__": 261 | sys.exit(main()) 262 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # OctaneDB - Lightweight & Fast Vector Database 6 | 7 | [![PyPI version](https://badge.fury.io/py/octanedb.svg)](https://badge.fury.io/py/octanedb) 8 | [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/) 9 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 10 | 11 | **OctaneDB** is a lightweight, high-performance Python vector database library built with modern Python and optimized algorithms. It's perfect for AI/ML applications requiring fast similarity search with HNSW indexing and flexible storage options. 12 | 13 | ## **Key Features** 14 | 15 | ### **Performance** 16 | - **Fast HNSW indexing** for approximate nearest neighbor search 17 | - **Sub-millisecond** query response times for typical workloads 18 | - **Efficient insertion** with configurable batch sizes 19 | - **Optimized memory usage** with HDF5 compression 20 | 21 | ### **Advanced Indexing** 22 | - **HNSW (Hierarchical Navigable Small World)** for ultra-fast approximate search 23 | - **FlatIndex** for exact similarity search 24 | - **Configurable parameters** for performance tuning 25 | - **Automatic index optimization** 26 | 27 | ### **Text Embedding Support** 28 | 29 | - **Automatic text-to-vector conversion** using sentence-transformers 30 | - **Multiple embedding models** (all-MiniLM-L6-v2, all-mpnet-base-v2, etc.)
31 | - **GPU acceleration** support (CUDA) 32 | - **Batch processing** for improved performance 33 | 34 | ### **Flexible Storage** 35 | - **In-memory** for maximum speed 36 | - **Persistent** file-based storage 37 | - **Hybrid** mode for best of both worlds 38 | - **HDF5 format** for efficient compression 39 | 40 | ### **Powerful Search** 41 | - **Multiple distance metrics**: Cosine, Euclidean, Dot Product, Manhattan, Chebyshev, Jaccard 42 | - **Advanced metadata filtering** with logical operators 43 | - **Batch search** operations 44 | - **Text-based search** with automatic embedding 45 | 46 | 47 | 48 | ### **Installation** 49 | 50 | ```bash 51 | pip install octanedb 52 | ``` 53 | 54 | ### **Basic Usage** 55 | 56 | ```python 57 | from octanedb import OctaneDB 58 | 59 | # Initialize with text embedding support 60 | db = OctaneDB( 61 | dimension=384, # Will be auto-set by embedding model 62 | embedding_model="all-MiniLM-L6-v2" 63 | ) 64 | 65 | # Create a collection 66 | collection = db.create_collection("documents") 67 | db.use_collection("documents") 68 | 69 | # Add text documents (ChromaDB-compatible!) 70 | result = db.add( 71 | ids=["doc1", "doc2"], 72 | documents=[ 73 | "This is a document about pineapple", 74 | "This is a document about oranges" 75 | ], 76 | metadatas=[ 77 | {"category": "tropical", "color": "yellow"}, 78 | {"category": "citrus", "color": "orange"} 79 | ] 80 | ) 81 | 82 | # Search by text query 83 | results = db.search_text( 84 | query_text="fruit", 85 | k=2, 86 | filter="category == 'tropical'", 87 | include_metadata=True 88 | ) 89 | 90 | for doc_id, distance, metadata in results: 91 | print(f"Document: {db.get_document(doc_id)}") 92 | print(f"Distance: {distance:.4f}") 93 | print(f"Metadata: {metadata}") 94 | ``` 95 | 96 | ## **Text Embedding Examples** 97 | 98 | ### **Working Basic Usage** 99 | 100 | Here's a complete working example that demonstrates OctaneDB's core functionality: 101 | 102 | ```python 103 | from octanedb import OctaneDB 104 | 105 | # Initialize database with text embeddings 106 | db = OctaneDB( 107 | dimension=384, # sentence-transformers default dimension 108 | storage_mode="in-memory", 109 | enable_text_embeddings=True, 110 | embedding_model="all-MiniLM-L6-v2" # Lightweight model 111 | ) 112 | 113 | # Create a collection 114 | db.create_collection("fruits") 115 | db.use_collection("fruits") 116 | 117 | # Add some fruit documents 118 | fruits_data = [ 119 | {"id": "apple", "text": "Apple is a sweet and crunchy fruit that grows on trees.", "category": "temperate"}, 120 | {"id": "banana", "text": "Banana is a yellow tropical fruit rich in potassium.", "category": "tropical"}, 121 | {"id": "mango", "text": "Mango is a sweet tropical fruit with a large seed.", "category": "tropical"}, 122 | {"id": "orange", "text": "Orange is a citrus fruit with a bright orange peel.", "category": "citrus"} 123 | ] 124 | 125 | for fruit in fruits_data: 126 | db.add( 127 | ids=[fruit["id"]], 128 | documents=[fruit["text"]], 129 | metadatas=[{"category": fruit["category"], "type": "fruit"}] 130 | ) 131 | 132 | # Simple text search 133 | results = db.search_text(query_text="sweet", k=2, include_metadata=True) 134 | print("Sweet fruits:") 135 | for doc_id, distance, metadata in results: 136 | print(f" • {doc_id}: {metadata.get('document', 'N/A')[:50]}...") 137 | 138 | # Text search with filter 139 | results = db.search_text( 140 | query_text="fruit", 141 | k=2, 142 | filter="category == 'tropical'", 143 | include_metadata=True 144 | ) 145 | print("\nTropical fruits:") 
146 | for doc_id, distance, metadata in results: 147 | print(f" • {doc_id}: {metadata.get('document', 'N/A')[:50]}...") 148 | ``` 149 | 150 | 151 | ### **Advanced Text Operations** 152 | 153 | ```python 154 | # Batch text search 155 | query_texts = ["machine learning", "artificial intelligence", "data science"] 156 | batch_results = db.search_text_batch( 157 | query_texts=query_texts, 158 | k=5, 159 | include_metadata=True 160 | ) 161 | 162 | # Change embedding models 163 | db.change_embedding_model("all-mpnet-base-v2") # Higher quality, 768 dimensions 164 | 165 | # Get available models 166 | models = db.get_available_models() 167 | print(f"Available models: {models}") 168 | ``` 169 | 170 | ### **Custom Embeddings** 171 | 172 | ```python 173 | # Use pre-computed embeddings 174 | custom_embeddings = np.random.randn(100, 384).astype(np.float32) 175 | result = db.add( 176 | ids=[f"vec_{i}" for i in range(100)], 177 | embeddings=custom_embeddings, 178 | metadatas=[{"source": "custom"} for _ in range(100)] 179 | ) 180 | ``` 181 | 182 | ## **Advanced Usage** 183 | 184 | ### **Performance Tuning** 185 | 186 | ```python 187 | # Optimize for speed vs. accuracy 188 | db = OctaneDB( 189 | dimension=384, 190 | m=8, # Fewer connections = faster, less accurate 191 | ef_construction=100, # Lower = faster build 192 | ef_search=50 # Lower = faster search 193 | ) 194 | ``` 195 | 196 | ### **Storage Management** 197 | 198 | ```python 199 | # Persistent storage 200 | db = OctaneDB( 201 | dimension=384, 202 | storage_path="./data", 203 | embedding_model="all-MiniLM-L6-v2" 204 | ) 205 | 206 | # Save and load 207 | db.save("./my_database.h5") 208 | loaded_db = OctaneDB.load("./my_database.h5") 209 | ``` 210 | 211 | ### **Metadata Filtering** 212 | 213 | ```python 214 | # Complex filters 215 | results = db.search_text( 216 | query_text="technology", 217 | k=10, 218 | filter={ 219 | "$and": [ 220 | {"category": "tech"}, 221 | {"$or": [ 222 | {"year": {"$gte": 2020}}, 223 | {"priority": "high"} 224 | ]} 225 | ] 226 | } 227 | ) 228 | ``` 229 | 230 | ## **Troubleshooting** 231 | 232 | ### **Common Issues** 233 | 234 | 1. **Empty search results**: Make sure to call `include_metadata=True` in your search methods to get metadata back. 235 | 236 | 2. **Query engine warnings**: The query engine for complex filters is under development. For now, use simple string filters like `"category == 'tropical'"`. 237 | 238 | 3. **Index not built**: The index is automatically built when needed, but you can manually trigger it with `collection._build_index()` if needed. 239 | 240 | 4. **Text embeddings not working**: Ensure you have `sentence-transformers` installed: `pip install sentence-transformers` 241 | 242 | ### **Working Example** 243 | 244 | ```python 245 | # This will work correctly: 246 | results = db.search_text( 247 | query_text="fruit", 248 | k=2, 249 | filter="category == 'tropical'", 250 | include_metadata=True # Important! 
251 | ) 252 | 253 | # Process results correctly: 254 | for doc_id, distance, metadata in results: 255 | print(f"ID: {doc_id}, Distance: {distance:.4f}") 256 | if metadata: 257 | print(f" Document: {metadata.get('document', 'N/A')}") 258 | print(f" Category: {metadata.get('category', 'N/A')}") 259 | ``` 260 | 261 | ## **Performance Benchmarks** 262 | 263 | ### **OctaneDB Performance Characteristics** 264 | 265 | **Test Environment:** 266 | - **Hardware**: Intel i5-1300H, 16GB RAM, SSD storage 267 | - **Dataset**: 100K vectors, 384 dimensions (float32) 268 | - **Index Type**: HNSW with default parameters (m=16, ef_construction=200, ef_search=100) 269 | - **Distance Metric**: Cosine similarity 270 | - **Storage Mode**: In-memory 271 | 272 | **Performance Results:** 273 | 274 | | Operation | Performance | Notes | 275 | |-----------|-------------|-------| 276 | | **Vector Insertion** | 2,800-3,500 vectors/sec | Single-threaded insertion with metadata | 277 | | **Index Build Time** | 45-60 seconds | HNSW index construction for 100K vectors | 278 | | **Single Query Search** | 0.5-2.0 milliseconds | k=10 nearest neighbors | 279 | | **Batch Search (100 queries)** | 150-200 queries/sec | k=10 per query | 280 | | **Memory Usage** | ~1.5GB | Including vectors, metadata, and HNSW index | 281 | | **Storage Efficiency** | ~15MB on disk | HDF5 compression for 100K vectors | 282 | 283 | **Performance Tuning Options:** 284 | - **Faster Build**: Reduce `ef_construction` (trades accuracy for speed) 285 | - **Faster Search**: Reduce `ef_search` (trades accuracy for speed) 286 | - **Memory Optimization**: Use `m=8` instead of `m=16` (fewer connections) 287 | - **Storage Mode**: In-memory for speed, persistent for data persistence 288 | 289 | **Benchmark Code:** 290 | ```bash 291 | # Run performance benchmarks using CLI 292 | octanedb benchmark --count 100000 --dimension 384 293 | 294 | # Or use the comprehensive Python benchmarking script 295 | python benchmark_octanedb.py --vectors 100000 --dimension 384 --runs 5 296 | 297 | # Or use the Python API directly 298 | from octanedb import OctaneDB 299 | db = OctaneDB(dimension=384) 300 | # ... run your own benchmarks 301 | ``` 302 | 303 | *Note: Performance varies based on hardware, dataset characteristics, and HNSW parameters. These numbers represent typical performance on the specified hardware configuration.* 304 | 305 | ## **Architecture** 306 | 307 | ``` 308 | OctaneDB 309 | ├── Core (OctaneDB) 310 | │ ├── Collection Management 311 | │ ├── Text Embedding Engine 312 | │ └── Storage Manager 313 | ├── Collections 314 | │ ├── Vector Storage (HDF5) 315 | │ ├── Metadata Management 316 | │ └── Index Management 317 | ├── Indexing 318 | │ ├── HNSW Index 319 | │ ├── Flat Index 320 | │ └── Distance Metrics 321 | ├── Text Processing 322 | │ ├── Sentence Transformers 323 | │ ├── GPU Acceleration 324 | │ └── Batch Processing 325 | └── Storage 326 | ├── HDF5 Vectors 327 | ├── Msgpack Metadata 328 | └── Compression 329 | ``` 330 | 331 | ## **Installation Options** 332 | 333 | ### **Basic Installation** 334 | ```bash 335 | pip install octanedb 336 | ``` 337 | 338 | ### **With GPU Support** 339 | ```bash 340 | pip install octanedb[gpu] 341 | ``` 342 | 343 | ### **Development Installation** 344 | ```bash 345 | git clone https://github.com/RijinRaju/octanedb.git 346 | cd octanedb 347 | pip install -e . 
348 | ``` 349 | 350 | ## **Requirements** 351 | 352 | - **Python**: 3.8+ 353 | - **Core Dependencies**: NumPy, h5py, msgpack, tqdm 354 | - **Text Embeddings**: sentence-transformers, transformers, torch 355 | - **Optional**: CUDA for GPU acceleration, matplotlib, pandas, seaborn for benchmarking 356 | 357 | 358 | ## **Contributing** 359 | 360 | We welcome contributions! Please see our [Contributing Guide](CONTRIBUTING.md) for details. 361 | 362 | ### **Development Setup** 363 | ```bash 364 | git clone https://github.com/RijinRaju/octanedb.git 365 | cd octanedb 366 | pip install -e ".[dev]" 367 | pytest tests/ 368 | ``` 369 | 370 | ## **License** 371 | 372 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 373 | 374 | ## **Acknowledgments** 375 | 376 | - **HNSW Algorithm**: Based on the Hierarchical Navigable Small World paper 377 | - **Sentence Transformers**: For text embedding capabilities 378 | - **HDF5**: For efficient vector storage 379 | - **NumPy**: For fast numerical operations 380 | 381 | ## **Development Note** 382 | 383 | **AI-Assisted Development**: This codebase was extensively developed with the assistance of Large Language Models (LLMs). The LLM assistance included: 384 | - Initial project structure 385 | - Core algorithm implementations (HNSW indexing, vector operations) 386 | - Documentation 387 | - Performance optimization suggestions 388 | 389 | -------------------------------------------------------------------------------- /octanedb/embeddings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Text embedding generation for OctaneDB. 3 | Provides automatic text-to-vector conversion using sentence-transformers. 4 | """ 5 | 6 | import logging 7 | from typing import List, Union, Optional, Dict, Any 8 | import numpy as np 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | try: 13 | from sentence_transformers import SentenceTransformer 14 | SENTENCE_TRANSFORMERS_AVAILABLE = True 15 | except ImportError: 16 | SENTENCE_TRANSFORMERS_AVAILABLE = False 17 | logger.warning("sentence-transformers not available. Install with: pip install sentence-transformers") 18 | 19 | try: 20 | import torch 21 | TORCH_AVAILABLE = True 22 | except ImportError: 23 | TORCH_AVAILABLE = False 24 | logger.warning("PyTorch not available. Install with: pip install torch") 25 | 26 | 27 | class TextEmbedder: 28 | """ 29 | Text embedding generator using sentence-transformers. 30 | 31 | Provides automatic text-to-vector conversion with support for: 32 | - Multiple embedding models 33 | - Batch processing 34 | - Custom model configuration 35 | - GPU acceleration (if available) 36 | """ 37 | 38 | def __init__( 39 | self, 40 | model_name: str = "all-MiniLM-L6-v2", 41 | device: Optional[str] = None, 42 | normalize_embeddings: bool = True, 43 | **kwargs 44 | ): 45 | """ 46 | Initialize the text embedder. 47 | 48 | Args: 49 | model_name: Name of the sentence-transformers model to use 50 | device: Device to use ('cpu', 'cuda', or None for auto) 51 | normalize_embeddings: Whether to normalize embeddings 52 | **kwargs: Additional arguments for SentenceTransformer 53 | """ 54 | if not SENTENCE_TRANSFORMERS_AVAILABLE: 55 | raise ImportError( 56 | "sentence-transformers is required for text embedding. 
" 57 | "Install with: pip install sentence-transformers" 58 | ) 59 | 60 | self.model_name = model_name 61 | self.normalize_embeddings = normalize_embeddings 62 | 63 | # Auto-detect device if not specified 64 | if device is None: 65 | if TORCH_AVAILABLE and torch.cuda.is_available(): 66 | device = "cuda" 67 | logger.info("CUDA detected, using GPU for embeddings") 68 | else: 69 | device = "cpu" 70 | logger.info("Using CPU for embeddings") 71 | 72 | self.device = device 73 | 74 | # Initialize the model 75 | logger.info(f"Loading sentence-transformers model: {model_name}") 76 | self.model = SentenceTransformer( 77 | model_name, 78 | device=device, 79 | **kwargs 80 | ) 81 | 82 | # Get model info 83 | self.dimension = self.model.get_sentence_embedding_dimension() 84 | logger.info(f"Model loaded successfully. Embedding dimension: {self.dimension}") 85 | 86 | # Performance tracking 87 | self._stats = { 88 | "embeddings_generated": 0, 89 | "total_tokens": 0, 90 | "total_time": 0.0 91 | } 92 | 93 | def embed_texts( 94 | self, 95 | texts: Union[str, List[str]], 96 | batch_size: int = 32, 97 | show_progress_bar: bool = False 98 | ) -> np.ndarray: 99 | """ 100 | Generate embeddings for text(s). 101 | 102 | Args: 103 | texts: Single text string or list of text strings 104 | batch_size: Batch size for processing 105 | show_progress_bar: Whether to show progress bar 106 | 107 | Returns: 108 | numpy array of embeddings with shape (n_texts, dimension) 109 | """ 110 | import time 111 | 112 | # Ensure texts is a list 113 | if isinstance(texts, str): 114 | texts = [texts] 115 | 116 | if not texts: 117 | return np.array([]) 118 | 119 | start_time = time.time() 120 | 121 | try: 122 | # Generate embeddings 123 | embeddings = self.model.encode( 124 | texts, 125 | batch_size=batch_size, 126 | show_progress_bar=show_progress_bar, 127 | normalize_embeddings=self.normalize_embeddings, 128 | convert_to_numpy=True 129 | ) 130 | 131 | # Ensure correct shape 132 | if embeddings.ndim == 1: 133 | embeddings = embeddings.reshape(1, -1) 134 | 135 | # Update stats 136 | self._stats["embeddings_generated"] += len(texts) 137 | self._stats["total_tokens"] += sum(len(text.split()) for text in texts) 138 | self._stats["total_time"] += time.time() - start_time 139 | 140 | logger.debug(f"Generated {len(texts)} embeddings in {time.time() - start_time:.4f}s") 141 | 142 | return embeddings.astype(np.float32) 143 | 144 | except Exception as e: 145 | logger.error(f"Error generating embeddings: {e}") 146 | raise 147 | 148 | def embed_single(self, text: str) -> np.ndarray: 149 | """ 150 | Generate embedding for a single text. 151 | 152 | Args: 153 | text: Text string to embed 154 | 155 | Returns: 156 | numpy array of embedding with shape (1, dimension) 157 | """ 158 | return self.embed_texts([text]) 159 | 160 | def get_model_info(self) -> Dict[str, Any]: 161 | """Get information about the loaded model.""" 162 | return { 163 | "model_name": self.model_name, 164 | "dimension": self.dimension, 165 | "device": self.device, 166 | "normalize_embeddings": self.normalize_embeddings, 167 | "stats": self._stats.copy() 168 | } 169 | 170 | def change_model(self, model_name: str, **kwargs): 171 | """ 172 | Change the embedding model. 
173 | 174 | Args: 175 | model_name: New model name 176 | **kwargs: Additional arguments for SentenceTransformer 177 | """ 178 | logger.info(f"Changing model from {self.model_name} to {model_name}") 179 | 180 | # Save current device 181 | device = self.device 182 | 183 | # Initialize new model 184 | self.model = SentenceTransformer( 185 | model_name, 186 | device=device, 187 | **kwargs 188 | ) 189 | 190 | self.model_name = model_name 191 | self.dimension = self.model.get_sentence_embedding_dimension() 192 | 193 | logger.info(f"Model changed successfully. New dimension: {self.dimension}") 194 | 195 | def get_available_models(self) -> List[str]: 196 | """Get list of available sentence-transformers models.""" 197 | # Common models that work well for most use cases 198 | return [ 199 | "all-MiniLM-L6-v2", # 384 dimensions, fast, good quality 200 | "all-MiniLM-L12-v2", # 384 dimensions, better quality, slower 201 | "all-mpnet-base-v2", # 768 dimensions, high quality 202 | "paraphrase-multilingual-MiniLM-L12-v2", # 384 dimensions, multilingual 203 | "paraphrase-MiniLM-L6-v2", # 384 dimensions, paraphrase-focused 204 | "distiluse-base-multilingual-cased-v2", # 512 dimensions, multilingual 205 | ] 206 | 207 | 208 | class ChromaCompatibleEmbedder(TextEmbedder): 209 | """ 210 | ChromaDB-compatible text embedder. 211 | 212 | Provides the same API as ChromaDB for easy migration: 213 | - add() method for documents 214 | - Automatic ID generation 215 | - Batch processing 216 | """ 217 | 218 | def __init__( 219 | self, 220 | model_name: str = "all-MiniLM-L6-v2", 221 | **kwargs 222 | ): 223 | """ 224 | Initialize ChromaDB-compatible embedder. 225 | 226 | Args: 227 | model_name: Sentence-transformers model name 228 | **kwargs: Additional arguments for TextEmbedder 229 | """ 230 | super().__init__(model_name, **kwargs) 231 | 232 | # Document storage (for ChromaDB compatibility) 233 | self._documents: Dict[str, str] = {} 234 | self._next_id = 0 235 | 236 | def add( 237 | self, 238 | ids: Optional[Union[str, List[str]]] = None, 239 | documents: Union[str, List[str]] = None, 240 | metadatas: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, 241 | embeddings: Optional[Union[List[float], List[List[float]]]] = None 242 | ) -> Dict[str, Any]: 243 | """ 244 | Add documents to the collection (ChromaDB-compatible API).
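        Example (illustrative; the IDs shown are auto-generated, not fixed names):
            embedder = ChromaCompatibleEmbedder()
            result = embedder.add(documents=["hello world"], metadatas=[{"lang": "en"}])
            # result["ids"] -> ["doc_0"] on a fresh embedder; embeddings are computed automatically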
245 | 246 | Args: 247 | ids: Document IDs (auto-generated if not provided) 248 | documents: Text documents to add 249 | metadatas: Optional metadata for documents 250 | embeddings: Pre-computed embeddings (optional) 251 | 252 | Returns: 253 | Dictionary with 'ids' and 'embeddings' keys 254 | """ 255 | # Ensure documents is a list 256 | if isinstance(documents, str): 257 | documents = [documents] 258 | 259 | if not documents: 260 | raise ValueError("No documents provided") 261 | 262 | num_docs = len(documents) 263 | 264 | # Handle IDs 265 | if ids is None: 266 | # Auto-generate IDs 267 | ids = [f"doc_{self._next_id + i}" for i in range(num_docs)] 268 | self._next_id += num_docs 269 | elif isinstance(ids, str): 270 | ids = [ids] 271 | 272 | if len(ids) != num_docs: 273 | raise ValueError("Number of IDs must match number of documents") 274 | 275 | # Handle metadata 276 | if metadatas is None: 277 | metadatas = [{} for _ in range(num_docs)] 278 | elif isinstance(metadatas, dict): 279 | metadatas = [metadatas for _ in range(num_docs)] 280 | 281 | if len(metadatas) != num_docs: 282 | raise ValueError("Number of metadatas must match number of documents") 283 | 284 | # Store documents 285 | for doc_id, doc_text in zip(ids, documents): 286 | self._documents[doc_id] = doc_text 287 | 288 | # Generate or use provided embeddings 289 | if embeddings is None: 290 | # Generate embeddings automatically 291 | embeddings = self.embed_texts(documents) 292 | else: 293 | # Convert to numpy array 294 | embeddings = np.array(embeddings, dtype=np.float32) 295 | if embeddings.ndim == 1: 296 | embeddings = embeddings.reshape(1, -1) 297 | 298 | # Prepare result 299 | result = { 300 | "ids": ids, 301 | "embeddings": embeddings, 302 | "documents": documents, 303 | "metadatas": metadatas 304 | } 305 | 306 | logger.info(f"Added {num_docs} documents with IDs: {ids}") 307 | return result 308 | 309 | def get_document(self, doc_id: str) -> Optional[str]: 310 | """Get a document by ID.""" 311 | return self._documents.get(doc_id) 312 | 313 | def list_documents(self) -> List[str]: 314 | """List all document IDs.""" 315 | return list(self._documents.keys()) 316 | 317 | def delete_document(self, doc_id: str) -> bool: 318 | """Delete a document by ID.""" 319 | if doc_id in self._documents: 320 | del self._documents[doc_id] 321 | logger.info(f"Deleted document: {doc_id}") 322 | return True 323 | return False 324 | 325 | def clear_documents(self): 326 | """Clear all documents.""" 327 | self._documents.clear() 328 | self._next_id = 0 329 | logger.info("Cleared all documents") 330 | 331 | def get_collection_info(self) -> Dict[str, Any]: 332 | """Get information about the document collection.""" 333 | return { 334 | "num_documents": len(self._documents), 335 | "model_info": self.get_model_info(), 336 | "document_ids": list(self._documents.keys()) 337 | } 338 | 339 | 340 | # Convenience function for quick embedding 341 | def embed_texts( 342 | texts: Union[str, List[str]], 343 | model_name: str = "all-MiniLM-L6-v2", 344 | **kwargs 345 | ) -> np.ndarray: 346 | """ 347 | Quick function to generate embeddings for texts. 
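    Example (illustrative):
        vectors = embed_texts(["hello world", "vector databases"], model_name="all-MiniLM-L6-v2")
        # vectors.shape == (2, 384) for this model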
348 | 349 | Args: 350 | texts: Text(s) to embed 351 | model_name: Model to use 352 | **kwargs: Additional arguments for TextEmbedder 353 | 354 | Returns: 355 | numpy array of embeddings 356 | """ 357 | embedder = TextEmbedder(model_name, **kwargs) 358 | return embedder.embed_texts(texts) 359 | -------------------------------------------------------------------------------- /octanedb/query.py: -------------------------------------------------------------------------------- 1 | """ 2 | Query engine for advanced filtering and query operations. 3 | """ 4 | 5 | import logging 6 | import re 7 | from typing import Dict, Any, List, Union, Callable 8 | from datetime import datetime 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class QueryEngine: 14 | """ 15 | Query engine for advanced filtering and query operations on vector metadata. 16 | 17 | Supports: 18 | - Complex filter expressions 19 | - Range queries 20 | - Text search 21 | - Logical operators 22 | - Custom filter functions 23 | """ 24 | 25 | def __init__(self): 26 | """Initialize the query engine.""" 27 | self._operators = { 28 | "$eq": self._eq, 29 | "$ne": self._ne, 30 | "$gt": self._gt, 31 | "$gte": self._gte, 32 | "$lt": self._lt, 33 | "$lte": self._lte, 34 | "$in": self._in, 35 | "$nin": self._nin, 36 | "$regex": self._regex, 37 | "$exists": self._exists, 38 | "$and": self._and, 39 | "$or": self._or, 40 | "$not": self._not, 41 | "$text": self._text_search 42 | } 43 | 44 | # Text search index for faster text queries 45 | self._text_index: Dict[str, Dict[str, List[int]]] = {} 46 | 47 | logger.info("Query engine initialized") 48 | 49 | def evaluate_filter(self, metadata: Dict[str, Any], filter_expr: Dict[str, Any]) -> bool: 50 | """ 51 | Evaluate a filter expression against metadata.
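        Example (illustrative; note the operator-first form this engine expects):
            engine = QueryEngine()
            engine.evaluate_filter(
                {"category": "tech", "year": 2021},
                {"$and": [{"category": "tech"}, {"$gte": {"year": 2020}}]}
            )  # -> True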
52 | 53 | Args: 54 | metadata: Vector metadata to evaluate 55 | filter_expr: Filter expression 56 | 57 | Returns: 58 | True if metadata matches filter, False otherwise 59 | """ 60 | try: 61 | return self._evaluate_expression(metadata, filter_expr) 62 | except Exception as e: 63 | logger.error(f"Error evaluating filter: {e}") 64 | return False 65 | 66 | def _evaluate_expression(self, metadata: Dict[str, Any], expr: Any) -> bool: 67 | """Evaluate a single expression.""" 68 | if isinstance(expr, dict): 69 | # Handle operator expressions 70 | for operator, value in expr.items(): 71 | if operator in self._operators: 72 | return self._operators[operator](metadata, value) 73 | else: 74 | # Direct field comparison 75 | return self._eq(metadata, {operator: value}) 76 | elif isinstance(expr, list): 77 | # Handle list expressions (AND logic) 78 | return all(self._evaluate_expression(metadata, item) for item in expr) 79 | else: 80 | # Simple value comparison 81 | return expr is True 82 | 83 | return False 84 | 85 | def _eq(self, metadata: Dict[str, Any], value: Any) -> bool: 86 | """Equality operator.""" 87 | if isinstance(value, dict): 88 | field, expected_value = next(iter(value.items())) 89 | return metadata.get(field) == expected_value 90 | return False 91 | 92 | def _ne(self, metadata: Dict[str, Any], value: Any) -> bool: 93 | """Not equality operator.""" 94 | if isinstance(value, dict): 95 | field, expected_value = next(iter(value.items())) 96 | return metadata.get(field) != expected_value 97 | return False 98 | 99 | def _gt(self, metadata: Dict[str, Any], value: Any) -> bool: 100 | """Greater than operator.""" 101 | if isinstance(value, dict): 102 | field, expected_value = next(iter(value.items())) 103 | field_value = metadata.get(field) 104 | if field_value is not None and expected_value is not None: 105 | return field_value > expected_value 106 | return False 107 | 108 | def _gte(self, metadata: Dict[str, Any], value: Any) -> bool: 109 | """Greater than or equal operator.""" 110 | if isinstance(value, dict): 111 | field, expected_value = next(iter(value.items())) 112 | field_value = metadata.get(field) 113 | if field_value is not None and expected_value is not None: 114 | return field_value >= expected_value 115 | return False 116 | 117 | def _lt(self, metadata: Dict[str, Any], value: Any) -> bool: 118 | """Less than operator.""" 119 | if isinstance(value, dict): 120 | field, expected_value = next(iter(value.items())) 121 | field_value = metadata.get(field) 122 | if field_value is not None and expected_value is not None: 123 | return field_value < expected_value 124 | return False 125 | 126 | def _lte(self, metadata: Dict[str, Any], value: Any) -> bool: 127 | """Less than or equal operator.""" 128 | if isinstance(value, dict): 129 | field, expected_value = next(iter(value.items())) 130 | field_value = metadata.get(field) 131 | if field_value is not None and expected_value is not None: 132 | return field_value <= expected_value 133 | return False 134 | 135 | def _in(self, metadata: Dict[str, Any], value: Any) -> bool: 136 | """In operator.""" 137 | if isinstance(value, dict): 138 | field, expected_values = next(iter(value.items())) 139 | if isinstance(expected_values, list): 140 | field_value = metadata.get(field) 141 | return field_value in expected_values 142 | return False 143 | 144 | def _nin(self, metadata: Dict[str, Any], value: Any) -> bool: 145 | """Not in operator.""" 146 | if isinstance(value, dict): 147 | field, expected_values = next(iter(value.items())) 148 | if 
isinstance(expected_values, list): 149 | field_value = metadata.get(field) 150 | return field_value not in expected_values 151 | return False 152 | 153 | def _regex(self, metadata: Dict[str, Any], value: Any) -> bool: 154 | """Regex operator.""" 155 | if isinstance(value, dict): 156 | field, pattern = next(iter(value.items())) 157 | field_value = metadata.get(field) 158 | if isinstance(field_value, str) and isinstance(pattern, str): 159 | try: 160 | return bool(re.search(pattern, field_value, re.IGNORECASE)) 161 | except re.error: 162 | return False 163 | return False 164 | 165 | def _exists(self, metadata: Dict[str, Any], value: Any) -> bool: 166 | """Exists operator.""" 167 | if isinstance(value, dict): 168 | field, should_exist = next(iter(value.items())) 169 | field_exists = field in metadata 170 | return field_exists == should_exist 171 | return False 172 | 173 | def _and(self, metadata: Dict[str, Any], value: Any) -> bool: 174 | """AND operator.""" 175 | if isinstance(value, list): 176 | return all(self._evaluate_expression(metadata, item) for item in value) 177 | return False 178 | 179 | def _or(self, metadata: Dict[str, Any], value: Any) -> bool: 180 | """OR operator.""" 181 | if isinstance(value, list): 182 | return any(self._evaluate_expression(metadata, item) for item in value) 183 | return False 184 | 185 | def _nor(self, metadata: Dict[str, Any], value: Any) -> bool: 186 | """NOR operator.""" 187 | if isinstance(value, list): 188 | return not any(self._evaluate_expression(metadata, item) for item in value) 189 | return False 190 | 191 | def _not(self, metadata: Dict[str, Any], value: Any) -> bool: 192 | """NOT operator.""" 193 | return not self._evaluate_expression(metadata, value) 194 | 195 | def _text_search(self, metadata: Dict[str, Any], value: Any) -> bool: 196 | """Text search operator.""" 197 | if isinstance(value, dict): 198 | field, search_text = next(iter(value.items())) 199 | field_value = metadata.get(field) 200 | if isinstance(field_value, str) and isinstance(search_text, str): 201 | return search_text.lower() in field_value.lower() 202 | return False 203 | 204 | def build_text_index(self, collection_name: str, metadata_list: List[Dict[str, Any]], vector_ids: List[int]) -> None: 205 | """ 206 | Build a text search index for faster text queries. 207 | 208 | Args: 209 | collection_name: Name of the collection 210 | metadata_list: List of metadata dictionaries 211 | vector_ids: Corresponding vector IDs 212 | """ 213 | if collection_name not in self._text_index: 214 | self._text_index[collection_name] = {} 215 | 216 | # Index text fields 217 | for metadata, vector_id in zip(metadata_list, vector_ids): 218 | for field, value in metadata.items(): 219 | if isinstance(value, str): 220 | if field not in self._text_index[collection_name]: 221 | self._text_index[collection_name][field] = {} 222 | 223 | # Create word index 224 | words = value.lower().split() 225 | for word in words: 226 | if word not in self._text_index[collection_name][field]: 227 | self._text_index[collection_name][field][word] = [] 228 | self._text_index[collection_name][field][word].append(vector_id) 229 | 230 | logger.info(f"Text index built for collection '{collection_name}'") 231 | 232 | def search_text(self, collection_name: str, field: str, query: str) -> List[int]: 233 | """ 234 | Search text using the built index. 
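        Example (illustrative; assumes the index was built first):
            engine.build_text_index("docs", [{"title": "fast vector search"}], [0])
            engine.search_text("docs", "title", "vector")  # -> [0]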
235 | 236 | Args: 237 | collection_name: Name of the collection 238 | field: Field to search in 239 | query: Search query 240 | 241 | Returns: 242 | List of vector IDs matching the query 243 | """ 244 | if collection_name not in self._text_index: 245 | return [] 246 | 247 | if field not in self._text_index[collection_name]: 248 | return [] 249 | 250 | query_words = query.lower().split() 251 | matching_ids = set() 252 | 253 | for word in query_words: 254 | if word in self._text_index[collection_name][field]: 255 | matching_ids.update(self._text_index[collection_name][field][word]) 256 | 257 | return list(matching_ids) 258 | 259 | def create_aggregation_pipeline(self, pipeline: List[Dict[str, Any]]) -> Callable: 260 | """ 261 | Create an aggregation pipeline for complex data processing. 262 | 263 | Args: 264 | pipeline: List of aggregation stages 265 | 266 | Returns: 267 | Callable function that executes the pipeline 268 | """ 269 | def execute_pipeline(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 270 | result = data 271 | 272 | for stage in pipeline: 273 | if "$match" in stage: 274 | result = [item for item in result if self.evaluate_filter(item, stage["$match"])] 275 | elif "$project" in stage: 276 | result = self._project_fields(result, stage["$project"]) 277 | elif "$sort" in stage: 278 | result = self._sort_data(result, stage["$sort"]) 279 | elif "$limit" in stage: 280 | result = result[:stage["$limit"]] 281 | elif "$skip" in stage: 282 | result = result[stage["$skip"]:] 283 | elif "$group" in stage: 284 | result = self._group_data(result, stage["$group"]) 285 | elif "$count" in stage: 286 | result = [{"count": len(result)}] 287 | 288 | return result 289 | 290 | return execute_pipeline 291 | 292 | def _project_fields(self, data: List[Dict[str, Any]], projection: Dict[str, Any]) -> List[Dict[str, Any]]: 293 | """Project specific fields from data.""" 294 | result = [] 295 | for item in data: 296 | projected_item = {} 297 | for field, include in projection.items(): 298 | if include: 299 | if field in item: 300 | projected_item[field] = item[field] 301 | result.append(projected_item) 302 | return result 303 | 304 | def _sort_data(self, data: List[Dict[str, Any]], sort_spec: Dict[str, Any]) -> List[Dict[str, Any]]: 305 | """Sort data based on sort specification.""" 306 | def sort_key(item): 307 | keys = [] 308 | for field, direction in sort_spec.items(): 309 | value = item.get(field, 0) 310 | if direction == -1: 311 | value = -value if isinstance(value, (int, float)) else value 312 | keys.append(value) 313 | return tuple(keys) 314 | 315 | return sorted(data, key=sort_key) 316 | 317 | def _group_data(self, data: List[Dict[str, Any]], group_spec: Dict[str, Any]) -> List[Dict[str, Any]]: 318 | """Group data based on group specification.""" 319 | groups = {} 320 | 321 | for item in data: 322 | group_key = tuple(item.get(field) for field in group_spec["_id"].values()) 323 | 324 | if group_key not in groups: 325 | groups[group_key] = {"_id": dict(zip(group_spec["_id"].keys(), group_key))} 326 | 327 | # Apply aggregations 328 | for field, agg in group_spec.items(): 329 | if field == "_id": 330 | continue 331 | 332 | if agg["$sum"] in item: 333 | if field not in groups[group_key]: 334 | groups[group_key][field] = 0 335 | groups[group_key][field] += item[agg["$sum"]] 336 | elif agg["$avg"] in item: 337 | if field not in groups[group_key]: 338 | groups[group_key][field] = {"sum": 0, "count": 0} 339 | groups[group_key][field]["sum"] += item[agg["$avg"]] 340 | groups[group_key][field]["count"] 
+= 1 341 | elif agg["$min"] in item: 342 | if field not in groups[group_key]: 343 | groups[group_key][field] = float('inf') 344 | groups[group_key][field] = min(groups[group_key][field], item[agg["$min"]]) 345 | elif agg["$max"] in item: 346 | if field not in groups[group_key]: 347 | groups[group_key][field] = float('-inf') 348 | groups[group_key][field] = max(groups[group_key][field], item[agg["$max"]]) 349 | 350 | # Convert averages 351 | for group in groups.values(): 352 | for field, value in group.items(): 353 | if isinstance(value, dict) and "sum" in value and "count" in value: 354 | group[field] = value["sum"] / value["count"] if value["count"] > 0 else 0 355 | 356 | return list(groups.values()) 357 | 358 | def get_query_stats(self) -> Dict[str, Any]: 359 | """Get query engine statistics.""" 360 | stats = { 361 | "text_indexes": len(self._text_index), 362 | "total_indexed_fields": sum(len(fields) for fields in self._text_index.values()), 363 | "total_indexed_words": sum( 364 | sum(len(words) for words in fields.values()) 365 | for fields in self._text_index.values() 366 | ) 367 | } 368 | 369 | return stats 370 | 371 | def clear_text_index(self, collection_name: str = None) -> None: 372 | """Clear text search index.""" 373 | if collection_name: 374 | if collection_name in self._text_index: 375 | del self._text_index[collection_name] 376 | logger.info(f"Text index cleared for collection '{collection_name}'") 377 | else: 378 | self._text_index.clear() 379 | logger.info("All text indexes cleared") 380 | -------------------------------------------------------------------------------- /benchmark_octanedb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | OctaneDB Performance Benchmarking Script 4 | 5 | This script provides comprehensive performance measurements for OctaneDB 6 | with different configurations and dataset sizes. 7 | 8 | Usage: 9 | python benchmark_octanedb.py [--vectors N] [--dimension D] [--runs R] 10 | 11 | """ 12 | 13 | import argparse 14 | import time 15 | import statistics 16 | import numpy as np 17 | import psutil 18 | import os 19 | from typing import Dict, List, Tuple 20 | import json 21 | 22 | try: 23 | from octanedb import OctaneDB 24 | except ImportError: 25 | print(" OctaneDB not installed. 
Install with: pip install octanedb") 26 | exit(1) 27 | 28 | 29 | class OctaneDBBenchmark: 30 | """Comprehensive benchmarking for OctaneDB.""" 31 | 32 | def __init__(self, dimension: int = 384, storage_mode: str = "in-memory"): 33 | self.dimension = dimension 34 | self.storage_mode = storage_mode 35 | self.db = None 36 | self.results = {} 37 | 38 | def setup_database(self): 39 | """Initialize the database and collection.""" 40 | print(f" Setting up OctaneDB (dimension={self.dimension}, storage={self.storage_mode})") 41 | 42 | self.db = OctaneDB( 43 | dimension=self.dimension, 44 | storage_mode=self.storage_mode 45 | ) 46 | 47 | # Create benchmark collection 48 | collection = self.db.create_collection("benchmark") 49 | self.db.use_collection("benchmark") 50 | 51 | print(" Database setup complete") 52 | 53 | def measure_memory_usage(self) -> Dict[str, float]: 54 | """Measure current memory usage.""" 55 | process = psutil.Process(os.getpid()) 56 | memory_info = process.memory_info() 57 | 58 | return { 59 | "rss_mb": memory_info.rss / 1024 / 1024, # Resident Set Size 60 | "vms_mb": memory_info.vms / 1024 / 1024, # Virtual Memory Size 61 | "percent": process.memory_percent() 62 | } 63 | 64 | def benchmark_insertion(self, num_vectors: int, batch_size: int = 1000) -> Dict[str, float]: 65 | """Benchmark vector insertion performance.""" 66 | print(f"Benchmarking insertion of {num_vectors:,} vectors...") 67 | 68 | # Generate random vectors 69 | vectors = np.random.randn(num_vectors, self.dimension).astype(np.float32) 70 | metadata = [{"id": i, "benchmark": True} for i in range(num_vectors)] 71 | 72 | # Measure insertion time 73 | start_time = time.time() 74 | start_memory = self.measure_memory_usage() 75 | 76 | # Insert in batches 77 | inserted_count = 0 78 | for i in range(0, num_vectors, batch_size): 79 | end_idx = min(i + batch_size, num_vectors) 80 | batch_vectors = vectors[i:end_idx] 81 | batch_metadata = metadata[i:end_idx] 82 | 83 | self.db.insert( 84 | vectors=batch_vectors.tolist(), 85 | metadata=batch_metadata, 86 | ids=list(range(i, end_idx)) 87 | ) 88 | 89 | inserted_count += len(batch_vectors) 90 | 91 | # Progress update 92 | if (i + batch_size) % (batch_size * 10) == 0 or end_idx == num_vectors: 93 | elapsed = time.time() - start_time 94 | rate = inserted_count / elapsed 95 | print(f" Inserted {inserted_count:,}/{num_vectors:,} vectors ({rate:.0f} vectors/sec)") 96 | 97 | end_time = time.time() 98 | end_memory = self.measure_memory_usage() 99 | 100 | total_time = end_time - start_time 101 | insertion_rate = num_vectors / total_time 102 | 103 | return { 104 | "total_time": total_time, 105 | "insertion_rate": insertion_rate, 106 | "batch_size": batch_size, 107 | "start_memory_mb": start_memory["rss_mb"], 108 | "end_memory_mb": end_memory["rss_mb"], 109 | "memory_increase_mb": end_memory["rss_mb"] - start_memory["rss_mb"] 110 | } 111 | 112 | def benchmark_index_build(self) -> Dict[str, float]: 113 | """Benchmark HNSW index construction.""" 114 | print("Benchmarking index construction...") 115 | 116 | start_time = time.time() 117 | start_memory = self.measure_memory_usage() 118 | 119 | # Build the index 120 | self.db._current_collection._build_index() 121 | 122 | end_time = time.time() 123 | end_memory = self.measure_memory_usage() 124 | 125 | build_time = end_time - start_time 126 | 127 | # Get index statistics 128 | stats = self.db._current_collection.get_stats() 129 | index_stats = stats.get("index_stats", {}) 130 | 131 | return { 132 | "build_time": build_time, 133 | "start_memory_mb": 
start_memory["rss_mb"], 134 | "end_memory_mb": end_memory["rss_mb"], 135 | "memory_increase_mb": end_memory["rss_mb"] - start_memory["rss_mb"], 136 | "index_stats": index_stats 137 | } 138 | 139 | def benchmark_search(self, num_queries: int = 100, k: int = 10) -> Dict[str, float]: 140 | """Benchmark search performance.""" 141 | print(f" Benchmarking search ({num_queries} queries, k={k})...") 142 | 143 | # Generate random query vectors 144 | query_vectors = np.random.randn(num_queries, self.dimension).astype(np.float32) 145 | 146 | search_times = [] 147 | results_counts = [] 148 | 149 | start_time = time.time() 150 | 151 | for i, query_vector in enumerate(query_vectors): 152 | query_start = time.time() 153 | 154 | results = self.db.search(query_vector=query_vector, k=k) 155 | 156 | query_time = time.time() - query_start 157 | search_times.append(query_time * 1000) # Convert to milliseconds 158 | results_counts.append(len(results)) 159 | 160 | # Progress update 161 | if (i + 1) % 20 == 0: 162 | print(f" Processed {i + 1}/{num_queries} queries...") 163 | 164 | total_time = time.time() - start_time 165 | 166 | return { 167 | "total_time": total_time, 168 | "queries_per_second": num_queries / total_time, 169 | "avg_search_time_ms": statistics.mean(search_times), 170 | "median_search_time_ms": statistics.median(search_times), 171 | "min_search_time_ms": min(search_times), 172 | "max_search_time_ms": max(search_times), 173 | "search_time_std_ms": statistics.stdev(search_times) if len(search_times) > 1 else 0, 174 | "avg_results_count": statistics.mean(results_counts), 175 | "search_times_ms": search_times 176 | } 177 | 178 | def benchmark_batch_search(self, num_queries: int = 100, k: int = 10) -> Dict[str, float]: 179 | """Benchmark batch search performance.""" 180 | print(f" Benchmarking batch search ({num_queries} queries, k={k})...") 181 | 182 | # Generate random query vectors 183 | query_vectors = np.random.randn(num_queries, self.dimension).astype(np.float32) 184 | 185 | start_time = time.time() 186 | 187 | # Perform batch search 188 | batch_results = self.db.search_batch(query_vectors=query_vectors, k=k) 189 | 190 | total_time = time.time() - start_time 191 | 192 | # Count total results 193 | total_results = sum(len(results) for results in batch_results) 194 | 195 | return { 196 | "total_time": total_time, 197 | "queries_per_second": num_queries / total_time, 198 | "total_results": total_results, 199 | "avg_results_per_query": total_results / num_queries 200 | } 201 | 202 | def run_comprehensive_benchmark(self, num_vectors: int, num_runs: int = 3) -> Dict: 203 | """Run comprehensive benchmark with multiple runs.""" 204 | print(f" Starting comprehensive benchmark ({num_runs} runs)") 205 | print(f" Dataset: {num_vectors:,} vectors, {self.dimension} dimensions") 206 | print(f" Storage: {self.storage_mode}") 207 | print("=" * 60) 208 | 209 | all_results = { 210 | "insertion": [], 211 | "index_build": [], 212 | "search": [], 213 | "batch_search": [], 214 | "system_info": self._get_system_info() 215 | } 216 | 217 | for run in range(num_runs): 218 | print(f"\n Run {run + 1}/{num_runs}") 219 | print("-" * 40) 220 | 221 | # Reset database for each run 222 | self.setup_database() 223 | 224 | # Benchmark insertion 225 | insertion_result = self.benchmark_insertion(num_vectors) 226 | all_results["insertion"].append(insertion_result) 227 | 228 | # Benchmark index build 229 | index_result = self.benchmark_index_build() 230 | all_results["index_build"].append(index_result) 231 | 232 | # Benchmark search 
233 | search_result = self.benchmark_search(100, 10) 234 | all_results["search"].append(search_result) 235 | 236 | # Benchmark batch search 237 | batch_result = self.benchmark_batch_search(100, 10) 238 | all_results["batch_search"].append(batch_result) 239 | 240 | print(f"Run {run + 1} completed") 241 | 242 | # Calculate aggregate results 243 | self.results = self._calculate_aggregate_results(all_results) 244 | 245 | return self.results 246 | 247 | def _get_system_info(self) -> Dict: 248 | """Get system information.""" 249 | return { 250 | "python_version": f"{os.sys.version_info.major}.{os.sys.version_info.minor}.{os.sys.version_info.micro}", 251 | "numpy_version": np.__version__, 252 | "cpu_count": os.cpu_count(), 253 | "memory_gb": psutil.virtual_memory().total / 1024 / 1024 / 1024, 254 | "platform": os.name 255 | } 256 | 257 | def _calculate_aggregate_results(self, all_results: Dict) -> Dict: 258 | """Calculate aggregate results from multiple runs.""" 259 | aggregated = {} 260 | 261 | for operation, runs in all_results.items(): 262 | if operation == "system_info": 263 | aggregated[operation] = runs 264 | continue 265 | 266 | aggregated[operation] = {} 267 | 268 | # Get all numeric values for each metric 269 | for metric in runs[0].keys(): 270 | if isinstance(runs[0][metric], (int, float)): 271 | values = [run[metric] for run in runs] 272 | aggregated[operation][f"{metric}_mean"] = statistics.mean(values) 273 | aggregated[operation][f"{metric}_median"] = statistics.median(values) 274 | aggregated[operation][f"{metric}_min"] = min(values) 275 | aggregated[operation][f"{metric}_max"] = max(values) 276 | if len(values) > 1: 277 | aggregated[operation][f"{metric}_std"] = statistics.stdev(values) 278 | else: 279 | aggregated[operation][f"{metric}_std"] = 0 280 | else: 281 | # For non-numeric values, just take the first run 282 | aggregated[operation][metric] = runs[0][metric] 283 | 284 | return aggregated 285 | 286 | def print_results(self): 287 | """Print formatted benchmark results.""" 288 | if not self.results: 289 | print(" No results to display. 
Run benchmark first.") 290 | return 291 | 292 | print("\n" + "=" * 60) 293 | print(" OCTANEDB BENCHMARK RESULTS") 294 | print("=" * 60) 295 | 296 | # System Information 297 | print(f"\n System Information:") 298 | sys_info = self.results["system_info"] 299 | print(f" Python: {sys_info['python_version']}") 300 | print(f" NumPy: {sys_info['numpy_version']}") 301 | print(f" CPU Cores: {sys_info['cpu_count']}") 302 | print(f" Memory: {sys_info['memory_gb']:.1f} GB") 303 | print(f" Platform: {sys_info['platform']}") 304 | 305 | # Insertion Results 306 | print(f"\nInsertion Performance:") 307 | insertion = self.results["insertion"] 308 | print(f" Rate: {insertion['insertion_rate_mean']:.0f} ± {insertion['insertion_rate_std']:.0f} vectors/sec") 309 | print(f" Total Time: {insertion['total_time_mean']:.2f} ± {insertion['total_time_std']:.2f} seconds") 310 | print(f" Memory Increase: {insertion['memory_increase_mb_mean']:.1f} ± {insertion['memory_increase_mb_std']:.1f} MB") 311 | 312 | # Index Build Results 313 | print(f"\n Index Build Performance:") 314 | index_build = self.results["index_build"] 315 | print(f" Build Time: {index_build['build_time_mean']:.2f} ± {index_build['build_time_std']:.2f} seconds") 316 | print(f" Memory Increase: {index_build['memory_increase_mb_mean']:.1f} ± {index_build['memory_increase_mb_std']:.1f} MB") 317 | 318 | # Search Results 319 | print(f"\n Search Performance:") 320 | search = self.results["search"] 321 | print(f" Single Query: {search['avg_search_time_ms_mean']:.2f} ± {search['search_time_std_ms_mean']:.2f} ms") 322 | print(f" Queries/sec: {search['queries_per_second_mean']:.1f} ± {search['queries_per_second_std']:.1f}") 323 | print(f" Min Time: {search['min_search_time_ms_mean']:.2f} ms") 324 | print(f" Max Time: {search['max_search_time_ms_mean']:.2f} ms") 325 | 326 | # Batch Search Results 327 | print(f"\n Batch Search Performance:") 328 | batch_search = self.results["batch_search"] 329 | print(f" Batch Queries/sec: {batch_search['queries_per_second_mean']:.1f} ± {batch_search['queries_per_second_std']:.1f}") 330 | print(f" Total Time: {batch_search['total_time_mean']:.2f} ± {batch_search['total_time_std']:.2f} seconds") 331 | 332 | print("\n" + "=" * 60) 333 | 334 | def save_results(self, filename: str = "octanedb_benchmark_results.json"): 335 | """Save results to JSON file.""" 336 | if not self.results: 337 | print(" No results to save. 
Run benchmark first.") 338 | return 339 | 340 | with open(filename, 'w') as f: 341 | json.dump(self.results, f, indent=2, default=str) 342 | 343 | print(f" Results saved to {filename}") 344 | 345 | 346 | def main(): 347 | """Main benchmark execution.""" 348 | parser = argparse.ArgumentParser( 349 | description="OctaneDB Performance Benchmarking", 350 | formatter_class=argparse.RawDescriptionHelpFormatter, 351 | epilog=""" 352 | Examples: 353 | python benchmark_octanedb.py --vectors 10000 --dimension 128 354 | python benchmark_octanedb.py --vectors 100000 --dimension 384 --runs 5 355 | python benchmark_octanedb.py --vectors 50000 --dimension 512 --storage persistent 356 | """ 357 | ) 358 | 359 | parser.add_argument( 360 | "--vectors", "-v", 361 | type=int, 362 | default=10000, 363 | help="Number of vectors to benchmark (default: 10000)" 364 | ) 365 | 366 | parser.add_argument( 367 | "--dimension", "-d", 368 | type=int, 369 | default=384, 370 | help="Vector dimension (default: 384)" 371 | ) 372 | 373 | parser.add_argument( 374 | "--runs", "-r", 375 | type=int, 376 | default=3, 377 | help="Number of benchmark runs (default: 3)" 378 | ) 379 | 380 | parser.add_argument( 381 | "--storage", "-s", 382 | choices=["in-memory", "persistent", "hybrid"], 383 | default="in-memory", 384 | help="Storage mode (default: in-memory)" 385 | ) 386 | 387 | parser.add_argument( 388 | "--save", "-o", 389 | action="store_true", 390 | help="Save results to JSON file" 391 | ) 392 | 393 | args = parser.parse_args() 394 | 395 | print(" OctaneDB Performance Benchmarking") 396 | print("=" * 50) 397 | 398 | # Run benchmark 399 | benchmark = OctaneDBBenchmark( 400 | dimension=args.dimension, 401 | storage_mode=args.storage 402 | ) 403 | 404 | try: 405 | results = benchmark.run_comprehensive_benchmark( 406 | num_vectors=args.vectors, 407 | num_runs=args.runs 408 | ) 409 | 410 | # Display results 411 | benchmark.print_results() 412 | 413 | # Save results if requested 414 | if args.save: 415 | benchmark.save_results() 416 | 417 | except KeyboardInterrupt: 418 | print("\n Benchmark interrupted by user") 419 | return 1 420 | except Exception as e: 421 | print(f"\n Benchmark failed: {e}") 422 | import traceback 423 | traceback.print_exc() 424 | return 1 425 | 426 | print("\n Benchmark completed successfully!") 427 | return 0 428 | 429 | 430 | if __name__ == "__main__": 431 | exit(main()) 432 | -------------------------------------------------------------------------------- /octanedb/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility classes for vector operations, distance metrics, and helper functions. 3 | """ 4 | 5 | import numpy as np 6 | import logging 7 | from typing import List, Tuple, Optional, Dict, Any, Union 8 | from enum import Enum 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class DistanceMetrics(Enum): 14 | """Supported distance metrics.""" 15 | COSINE = "cosine" 16 | EUCLIDEAN = "euclidean" 17 | DOT = "dot" 18 | MANHATTAN = "manhattan" 19 | CHEBYSHEV = "chebyshev" 20 | JACCARD = "jaccard" 21 | 22 | 23 | class VectorUtils: 24 | """ 25 | Utility class for vector operations and distance computations. 26 | 27 | Provides: 28 | - Fast distance calculations 29 | - Vector normalization 30 | - Batch operations 31 | - Performance optimizations 32 | """ 33 | 34 | def __init__(self, distance_metric: str = "cosine"): 35 | """ 36 | Initialize vector utilities. 
37 | 38 | Args: 39 | distance_metric: Default distance metric to use 40 | """ 41 | self.distance_metric = distance_metric 42 | self._distance_func = self._get_distance_function() 43 | 44 | logger.info(f"Vector utilities initialized with metric: {distance_metric}") 45 | 46 | def _get_distance_function(self): 47 | """Get the appropriate distance function.""" 48 | if self.distance_metric == "cosine": 49 | return self.cosine_distance 50 | elif self.distance_metric == "euclidean": 51 | return self.euclidean_distance 52 | elif self.distance_metric == "dot": 53 | return self.dot_distance 54 | elif self.distance_metric == "manhattan": 55 | return self.manhattan_distance 56 | elif self.distance_metric == "chebyshev": 57 | return self.chebyshev_distance 58 | elif self.distance_metric == "jaccard": 59 | return self.jaccard_distance 60 | else: 61 | raise ValueError(f"Unsupported distance metric: {self.distance_metric}") 62 | 63 | @staticmethod 64 | def cosine_distance(a: np.ndarray, b: np.ndarray) -> float: 65 | """ 66 | Compute cosine distance between two vectors. 67 | 68 | Args: 69 | a: First vector 70 | b: Second vector 71 | 72 | Returns: 73 | Cosine distance (0 = identical, 2 = opposite) 74 | """ 75 | dot_product = np.dot(a, b) 76 | norm_a = np.linalg.norm(a) 77 | norm_b = np.linalg.norm(b) 78 | 79 | if norm_a == 0 or norm_b == 0: 80 | return 1.0 81 | 82 | cosine_similarity = dot_product / (norm_a * norm_b) 83 | return 1.0 - cosine_similarity 84 | 85 | @staticmethod 86 | def euclidean_distance(a: np.ndarray, b: np.ndarray) -> float: 87 | """ 88 | Compute Euclidean distance between two vectors. 89 | 90 | Args: 91 | a: First vector 92 | b: Second vector 93 | 94 | Returns: 95 | Euclidean distance 96 | """ 97 | return np.linalg.norm(a - b) 98 | 99 | @staticmethod 100 | def dot_distance(a: np.ndarray, b: np.ndarray) -> float: 101 | """ 102 | Compute negative dot product distance. 103 | 104 | Args: 105 | a: First vector 106 | b: Second vector 107 | 108 | Returns: 109 | Negative dot product 110 | """ 111 | return -np.dot(a, b) 112 | 113 | @staticmethod 114 | def manhattan_distance(a: np.ndarray, b: np.ndarray) -> float: 115 | """ 116 | Compute Manhattan (L1) distance between two vectors. 117 | 118 | Args: 119 | a: First vector 120 | b: Second vector 121 | 122 | Returns: 123 | Manhattan distance 124 | """ 125 | return np.sum(np.abs(a - b)) 126 | 127 | @staticmethod 128 | def chebyshev_distance(a: np.ndarray, b: np.ndarray) -> float: 129 | """ 130 | Compute Chebyshev (L∞) distance between two vectors. 131 | 132 | Args: 133 | a: First vector 134 | b: Second vector 135 | 136 | Returns: 137 | Chebyshev distance 138 | """ 139 | return np.max(np.abs(a - b)) 140 | 141 | @staticmethod 142 | def jaccard_distance(a: np.ndarray, b: np.ndarray) -> float: 143 | """ 144 | Compute Jaccard distance between two vectors. 145 | 146 | Args: 147 | a: First vector 148 | b: Second vector 149 | 150 | Returns: 151 | Jaccard distance 152 | """ 153 | intersection = np.sum(np.minimum(a, b)) 154 | union = np.sum(np.maximum(a, b)) 155 | 156 | if union == 0: 157 | return 1.0 158 | 159 | return 1.0 - (intersection / union) 160 | 161 | def compute_distance(self, a: np.ndarray, b: np.ndarray) -> float: 162 | """ 163 | Compute distance between two vectors using the configured metric. 
164 | 165 | Args: 166 | a: First vector 167 | b: Second vector 168 | 169 | Returns: 170 | Distance value 171 | """ 172 | return self._distance_func(a, b) 173 | 174 | def compute_distances_batch( 175 | self, 176 | query: np.ndarray, 177 | vectors: np.ndarray 178 | ) -> np.ndarray: 179 | """ 180 | Compute distances between a query vector and multiple vectors. 181 | 182 | Args: 183 | query: Query vector 184 | vectors: Array of vectors to compare against 185 | 186 | Returns: 187 | Array of distances 188 | """ 189 | if self.distance_metric == "cosine": 190 | # Optimized cosine distance for batch computation 191 | query_norm = np.linalg.norm(query) 192 | if query_norm == 0: 193 | return np.ones(len(vectors)) 194 | 195 | # Normalize query 196 | query_normalized = query / query_norm 197 | 198 | # Compute norms for all vectors 199 | vector_norms = np.linalg.norm(vectors, axis=1) 200 | 201 | # Avoid division by zero 202 | valid_norms = vector_norms > 0 203 | distances = np.ones(len(vectors)) 204 | 205 | if np.any(valid_norms): 206 | # Compute dot products 207 | dot_products = np.dot(vectors[valid_norms], query_normalized) 208 | 209 | # Compute cosine similarities 210 | cosine_similarities = dot_products / vector_norms[valid_norms] 211 | 212 | # Convert to distances 213 | distances[valid_norms] = 1.0 - cosine_similarities 214 | 215 | return distances 216 | 217 | elif self.distance_metric == "euclidean": 218 | # Optimized Euclidean distance for batch computation 219 | return np.linalg.norm(vectors - query, axis=1) 220 | 221 | else: 222 | # Fallback to individual computation 223 | distances = np.zeros(len(vectors)) 224 | for i, vector in enumerate(vectors): 225 | distances[i] = self._distance_func(query, vector) 226 | return distances 227 | 228 | @staticmethod 229 | def normalize_vector(vector: np.ndarray) -> np.ndarray: 230 | """ 231 | Normalize a vector to unit length. 232 | 233 | Args: 234 | vector: Vector to normalize 235 | 236 | Returns: 237 | Normalized vector 238 | """ 239 | norm = np.linalg.norm(vector) 240 | if norm == 0: 241 | return vector 242 | return vector / norm 243 | 244 | @staticmethod 245 | def normalize_vectors(vectors: np.ndarray) -> np.ndarray: 246 | """ 247 | Normalize multiple vectors to unit length. 248 | 249 | Args: 250 | vectors: Array of vectors to normalize 251 | 252 | Returns: 253 | Array of normalized vectors 254 | """ 255 | norms = np.linalg.norm(vectors, axis=1, keepdims=True) 256 | # Avoid division by zero 257 | norms[norms == 0] = 1.0 258 | return vectors / norms 259 | 260 | @staticmethod 261 | def compute_centroid(vectors: np.ndarray) -> np.ndarray: 262 | """ 263 | Compute the centroid (mean) of multiple vectors. 264 | 265 | Args: 266 | vectors: Array of vectors 267 | 268 | Returns: 269 | Centroid vector 270 | """ 271 | if len(vectors) == 0: 272 | raise ValueError("Cannot compute centroid of empty vector set") 273 | 274 | return np.mean(vectors, axis=0) 275 | 276 | @staticmethod 277 | def compute_variance(vectors: np.ndarray) -> np.ndarray: 278 | """ 279 | Compute the variance of multiple vectors. 
280 | 281 | Args: 282 | vectors: Array of vectors 283 | 284 | Returns: 285 | Variance vector 286 | """ 287 | if len(vectors) < 2: 288 | raise ValueError("Need at least 2 vectors to compute variance") 289 | 290 | return np.var(vectors, axis=0) 291 | 292 | @staticmethod 293 | def find_nearest_neighbors( 294 | query: np.ndarray, 295 | vectors: np.ndarray, 296 | k: int, 297 | distance_func: callable = None 298 | ) -> Tuple[np.ndarray, np.ndarray]: 299 | """ 300 | Find k nearest neighbors using brute force search. 301 | 302 | Args: 303 | query: Query vector 304 | vectors: Array of vectors to search in 305 | k: Number of neighbors to find 306 | distance_func: Distance function to use 307 | 308 | Returns: 309 | Tuple of (indices, distances) 310 | """ 311 | if distance_func is None: 312 | distance_func = VectorUtils.cosine_distance 313 | 314 | # Compute distances 315 | distances = np.array([ 316 | distance_func(query, vector) for vector in vectors 317 | ]) 318 | 319 | # Find k nearest neighbors 320 | if k >= len(vectors): 321 | indices = np.arange(len(vectors)) 322 | else: 323 | indices = np.argpartition(distances, k)[:k] 324 | 325 | # Sort by distance 326 | sorted_indices = indices[np.argsort(distances[indices])] 327 | sorted_distances = distances[sorted_indices] 328 | 329 | return sorted_indices, sorted_distances 330 | 331 | @staticmethod 332 | def compute_similarity_matrix(vectors: np.ndarray, metric: str = "cosine") -> np.ndarray: 333 | """ 334 | Compute similarity matrix between all pairs of vectors. 335 | 336 | Args: 337 | vectors: Array of vectors 338 | metric: Similarity metric to use 339 | 340 | Returns: 341 | Similarity matrix 342 | """ 343 | n = len(vectors) 344 | similarity_matrix = np.zeros((n, n)) 345 | 346 | if metric == "cosine": 347 | # Normalize vectors 348 | normalized_vectors = VectorUtils.normalize_vectors(vectors) 349 | 350 | # Compute cosine similarities 351 | similarity_matrix = np.dot(normalized_vectors, normalized_vectors.T) 352 | 353 | # Ensure diagonal is 1.0 (self-similarity) 354 | np.fill_diagonal(similarity_matrix, 1.0) 355 | 356 | elif metric == "dot": 357 | # Compute dot products 358 | similarity_matrix = np.dot(vectors, vectors.T) 359 | 360 | else: 361 | # Compute distances and convert to similarities 362 | for i in range(n): 363 | for j in range(n): 364 | if i == j: 365 | similarity_matrix[i, j] = 1.0 366 | else: 367 | distance = VectorUtils.compute_distance(vectors[i], vectors[j]) 368 | # Convert distance to similarity (inverse relationship) 369 | similarity_matrix[i, j] = 1.0 / (1.0 + distance) 370 | 371 | return similarity_matrix 372 | 373 | @staticmethod 374 | def compute_diversity_score(vectors: np.ndarray, metric: str = "cosine") -> float: 375 | """ 376 | Compute diversity score for a set of vectors. 
377 | 378 | Args: 379 | vectors: Array of vectors 380 | metric: Distance metric to use 381 | 382 | Returns: 383 | Diversity score (higher = more diverse) 384 | """ 385 | if len(vectors) < 2: 386 | return 0.0 387 | 388 | # Compute pairwise distances 389 | total_distance = 0.0 390 | count = 0 391 | 392 | for i in range(len(vectors)): 393 | for j in range(i + 1, len(vectors)): 394 | distance = VectorUtils.compute_distance(vectors[i], vectors[j]) 395 | total_distance += distance 396 | count += 1 397 | 398 | # Return average distance 399 | return total_distance / count if count > 0 else 0.0 400 | 401 | @staticmethod 402 | def compute_quality_score( 403 | query: np.ndarray, 404 | results: np.ndarray, 405 | ground_truth: np.ndarray = None 406 | ) -> Dict[str, float]: 407 | """ 408 | Compute quality metrics for search results. 409 | 410 | Args: 411 | query: Query vector 412 | results: Retrieved result vectors 413 | ground_truth: Ground truth relevant vectors (optional) 414 | 415 | Returns: 416 | Dictionary of quality metrics 417 | """ 418 | metrics = {} 419 | 420 | # Compute average distance to query 421 | distances = np.array([ 422 | VectorUtils.cosine_distance(query, result) for result in results 423 | ]) 424 | 425 | metrics["avg_distance"] = float(np.mean(distances)) 426 | metrics["min_distance"] = float(np.min(distances)) 427 | metrics["max_distance"] = float(np.max(distances)) 428 | metrics["std_distance"] = float(np.std(distances)) 429 | 430 | # Compute diversity of results 431 | if len(results) > 1: 432 | metrics["diversity"] = VectorUtils.compute_diversity_score(results) 433 | 434 | # Compute precision/recall if ground truth is provided 435 | if ground_truth is not None: 436 | # This is a simplified implementation 437 | # In practice, you'd need relevance scores or binary relevance 438 | metrics["num_relevant"] = len(ground_truth) 439 | metrics["num_retrieved"] = len(results) 440 | 441 | return metrics 442 | 443 | def get_stats(self) -> Dict[str, Any]: 444 | """Get vector utilities statistics.""" 445 | return { 446 | "distance_metric": self.distance_metric, 447 | "supported_metrics": [metric.value for metric in DistanceMetrics] 448 | } 449 | 450 | 451 | class PerformanceOptimizer: 452 | """ 453 | Performance optimization utilities for vector operations. 454 | """ 455 | 456 | @staticmethod 457 | def optimize_batch_size( 458 | vector_dimension: int, 459 | available_memory: int = None, 460 | target_latency: float = 0.1 461 | ) -> int: 462 | """ 463 | Optimize batch size for vector operations. 
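        Example (illustrative):
            PerformanceOptimizer.optimize_batch_size(vector_dimension=384)
            # -> 5000 with the default heuristics (no memory hint given, 384 > 100 dims)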
464 | 465 | Args: 466 | vector_dimension: Dimension of vectors 467 | available_memory: Available memory in bytes 468 | target_latency: Target latency in seconds 469 | 470 | Returns: 471 | Optimal batch size 472 | """ 473 | # Estimate memory per vector (float32) 474 | bytes_per_vector = vector_dimension * 4 475 | 476 | if available_memory: 477 | # Use 80% of available memory 478 | usable_memory = int(available_memory * 0.8) 479 | max_batch_size = usable_memory // bytes_per_vector 480 | else: 481 | # Default to reasonable batch size 482 | max_batch_size = 10000 483 | 484 | # Adjust based on dimension 485 | if vector_dimension > 1000: 486 | max_batch_size = min(max_batch_size, 1000) 487 | elif vector_dimension > 100: 488 | max_batch_size = min(max_batch_size, 5000) 489 | 490 | return max(1, min(max_batch_size, 100000)) 491 | 492 | @staticmethod 493 | def estimate_memory_usage( 494 | num_vectors: int, 495 | vector_dimension: int, 496 | include_index: bool = True 497 | ) -> Dict[str, int]: 498 | """ 499 | Estimate memory usage for vector storage. 500 | 501 | Args: 502 | num_vectors: Number of vectors 503 | vector_dimension: Dimension of vectors 504 | include_index: Whether to include index memory 505 | 506 | Returns: 507 | Dictionary of memory usage estimates 508 | """ 509 | # Vector storage (float32) 510 | vector_memory = num_vectors * vector_dimension * 4 511 | 512 | # Metadata storage (estimated) 513 | metadata_memory = num_vectors * 100 # Rough estimate 514 | 515 | # Index memory 516 | index_memory = 0 517 | if include_index: 518 | # HNSW index memory (rough estimate) 519 | index_memory = num_vectors * vector_dimension * 2 520 | 521 | total_memory = vector_memory + metadata_memory + index_memory 522 | 523 | return { 524 | "vectors": vector_memory, 525 | "metadata": metadata_memory, 526 | "index": index_memory, 527 | "total": total_memory 528 | } 529 | 530 | @staticmethod 531 | def get_optimal_threads(num_vectors: int, vector_dimension: int) -> int: 532 | """ 533 | Get optimal number of threads for parallel processing. 534 | 535 | Args: 536 | num_vectors: Number of vectors 537 | vector_dimension: Dimension of vectors 538 | 539 | Returns: 540 | Optimal number of threads 541 | """ 542 | import os 543 | 544 | # Get CPU count 545 | cpu_count = os.cpu_count() or 1 546 | 547 | # Adjust based on workload 548 | if num_vectors < 1000 or vector_dimension < 100: 549 | return min(cpu_count, 2) 550 | elif num_vectors < 10000: 551 | return min(cpu_count, 4) 552 | else: 553 | return min(cpu_count, 8) 554 | -------------------------------------------------------------------------------- /octanedb/storage.py: -------------------------------------------------------------------------------- 1 | """ 2 | Storage manager for efficient persistence and loading of vector databases. 3 | """ 4 | 5 | import numpy as np 6 | import h5py 7 | import msgpack 8 | import logging 9 | import time 10 | from typing import Dict, Any, Optional, List, Union 11 | from pathlib import Path 12 | from collections import defaultdict 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class StorageManager: 18 | """ 19 | Storage manager for efficient persistence and loading of vector databases. 
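    Example (illustrative; `collections` is assumed to be a dict mapping names to Collection objects):
        manager = StorageManager(storage_path=Path("./data"))
        manager.save_database(collections, Path("./data/my_database.h5"))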
20 | 21 | Features: 22 | - HDF5-based storage with compression 23 | - Efficient metadata serialization 24 | - Incremental saves and loads 25 | - Memory-mapped access for large datasets 26 | """ 27 | 28 | def __init__( 29 | self, 30 | storage_path: Optional[Path] = None, 31 | enable_cache: bool = True, 32 | cache_size: int = 10000, 33 | compression: str = "gzip", 34 | compression_opts: int = 6 35 | ): 36 | """ 37 | Initialize storage manager. 38 | 39 | Args: 40 | storage_path: Base path for storage 41 | enable_cache: Enable caching for better performance 42 | cache_size: Maximum cache size 43 | compression: Compression algorithm 44 | compression_opts: Compression options 45 | """ 46 | self.storage_path = storage_path 47 | self.enable_cache = enable_cache 48 | self.cache_size = cache_size 49 | self.compression = compression 50 | self.compression_opts = compression_opts 51 | 52 | # Cache for frequently accessed data 53 | self._cache: Dict[str, Any] = {} 54 | self._cache_hits = 0 55 | self._cache_misses = 0 56 | 57 | # Create storage directory if it doesn't exist 58 | if self.storage_path: 59 | self.storage_path.mkdir(parents=True, exist_ok=True) 60 | 61 | logger.info(f"Storage manager initialized at {self.storage_path}") 62 | 63 | def save_database(self, collections: Dict[str, Any], save_path: Path) -> None: 64 | """ 65 | Save the entire database to disk. 66 | 67 | Args: 68 | collections: Dictionary of collections to save 69 | save_path: Path to save the database 70 | """ 71 | start_time = time.time() 72 | 73 | # Create HDF5 file 74 | with h5py.File(save_path, 'w') as f: 75 | # Save database metadata 76 | self._save_database_metadata(f, collections) 77 | 78 | # Save each collection 79 | for collection_name, collection in collections.items(): 80 | self._save_collection(f, collection_name, collection) 81 | 82 | save_time = time.time() - start_time 83 | logger.info(f"Database saved to {save_path} in {save_time:.4f}s") 84 | 85 | def _save_database_metadata(self, f: h5py.File, collections: Dict[str, Any]) -> None: 86 | """Save database-level metadata.""" 87 | # Database info 88 | db_info = { 89 | "version": "0.1.0", 90 | "created_at": time.time(), 91 | "num_collections": len(collections), 92 | "total_vectors": sum(c.count() for c in collections.values()) 93 | } 94 | 95 | # Collection metadata 96 | collection_metadata = {} 97 | for name, collection in collections.items(): 98 | collection_metadata[name] = { 99 | "dimension": collection.dimension, 100 | "index_type": collection.index_type, 101 | "m": collection.m, 102 | "ef_construction": collection.ef_construction, 103 | "ef_search": collection.ef_search, 104 | "max_elements": collection.max_elements, 105 | "distance_metric": collection.distance_metric, 106 | "vector_count": collection.count(), 107 | "metadata_count": len(collection._metadata) 108 | } 109 | 110 | # Save as datasets to avoid HDF5 attribute limitations 111 | f.create_dataset('db_info', data=np.frombuffer(msgpack.packb(db_info), dtype=np.uint8)) 112 | f.create_dataset('collection_metadata', data=np.frombuffer(msgpack.packb(collection_metadata), dtype=np.uint8)) 113 | 114 | def _save_collection(self, f: h5py.File, collection_name: str, collection: Any) -> None: 115 | """Save a single collection.""" 116 | # Create collection group 117 | collection_group = f.create_group(collection_name) 118 | 119 | # Save collection metadata 120 | collection_group.attrs['name'] = collection_name 121 | collection_group.attrs['dimension'] = collection.dimension 122 | collection_group.attrs['index_type'] = 
collection.index_type 123 | collection_group.attrs['distance_metric'] = collection.distance_metric 124 | 125 | # Save vectors 126 | if collection._vectors: 127 | vectors_array = np.array([v for v in collection._vectors.values()], dtype=np.float32) 128 | vector_ids = list(collection._vectors.keys()) 129 | 130 | # Save vectors with compression 131 | collection_group.create_dataset( 132 | 'vectors', 133 | data=vectors_array, 134 | compression=self.compression, 135 | compression_opts=self.compression_opts 136 | ) 137 | 138 | # Save vector IDs 139 | collection_group.create_dataset( 140 | 'vector_ids', 141 | data=vector_ids, 142 | compression=self.compression, 143 | compression_opts=self.compression_opts 144 | ) 145 | 146 | # Save metadata 147 | if collection._metadata: 148 | # Convert integer keys to strings for msgpack compatibility 149 | metadata_for_storage = {str(k): v for k, v in collection._metadata.items()} 150 | # Convert metadata to bytes 151 | metadata_bytes = msgpack.packb(metadata_for_storage) 152 | collection_group.create_dataset( 153 | 'metadata', 154 | data=np.frombuffer(metadata_bytes, dtype=np.uint8), 155 | compression=self.compression, 156 | compression_opts=self.compression_opts 157 | ) 158 | 159 | # Save index if built 160 | if collection._index and collection._index_built: 161 | self._save_index(collection_group, collection._index) 162 | 163 | # Save collection stats 164 | stats = collection.get_stats() 165 | collection_group.create_dataset('stats', data=np.frombuffer(msgpack.packb(stats), dtype=np.uint8)) 166 | 167 | def _save_index(self, collection_group: h5py.Group, index) -> None: 168 | """Save index data.""" 169 | if hasattr(index, 'get_stats'): 170 | index_stats = index.get_stats() 171 | collection_group.create_dataset('index_stats', data=np.frombuffer(msgpack.packb(index_stats), dtype=np.uint8)) 172 | 173 | def load_database(self, load_path: Path) -> Dict[str, Any]: 174 | """ 175 | Load the entire database from disk. 
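        Example (a minimal sketch of the expected call pattern; the directory and
        file name here are illustrative, not defaults used by the library):

            manager = StorageManager(storage_path=Path("./octane_data"))
            collections = manager.load_database(Path("./octane_data/database.h5"))
            print(list(collections.keys()))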
176 | 177 | Args: 178 | load_path: Path to load the database from 179 | 180 | Returns: 181 | Dictionary of loaded collections 182 | """ 183 | start_time = time.time() 184 | 185 | collections = {} 186 | 187 | with h5py.File(load_path, 'r') as f: 188 | # Load database metadata from datasets 189 | db_info = msgpack.unpackb(bytes(f['db_info'][:])) 190 | collection_metadata = msgpack.unpackb(bytes(f['collection_metadata'][:])) 191 | 192 | # Load each collection 193 | for collection_name in f.keys(): 194 | if collection_name not in ['db_info', 'collection_metadata']: 195 | collection = self._load_collection(f[collection_name], collection_name) 196 | collections[collection_name] = collection 197 | 198 | load_time = time.time() - start_time 199 | logger.info(f"Database loaded from {load_path} in {load_time:.4f}s") 200 | 201 | return collections 202 | 203 | def load_database_metadata(self, load_path: Path) -> Dict[str, Any]: 204 | """Load only database metadata without loading vectors.""" 205 | with h5py.File(load_path, 'r') as f: 206 | db_info = msgpack.unpackb(bytes(f['db_info'][:])) 207 | collection_metadata = msgpack.unpackb(bytes(f['collection_metadata'][:])) 208 | 209 | # Combine metadata 210 | metadata = db_info.copy() 211 | metadata['collections'] = collection_metadata 212 | 213 | return metadata 214 | 215 | def _load_collection(self, collection_group: h5py.Group, collection_name: str) -> Any: 216 | """Load a single collection.""" 217 | # Get collection metadata from attributes (primary source) 218 | dimension = collection_group.attrs.get('dimension') 219 | index_type = collection_group.attrs.get('index_type', 'hnsw') 220 | distance_metric = collection_group.attrs.get('distance_metric', 'cosine') 221 | 222 | # Validate that we have the essential dimension 223 | if dimension is None: 224 | raise ValueError(f"Collection '{collection_name}' missing dimension information") 225 | 226 | # Create collection using string import to avoid circular dependency 227 | from .collection import Collection 228 | collection = Collection( 229 | name=collection_name, 230 | dimension=dimension, 231 | index_type=index_type, 232 | distance_metric=distance_metric 233 | ) 234 | 235 | # Load vectors 236 | if 'vectors' in collection_group and 'vector_ids' in collection_group: 237 | vectors = collection_group['vectors'][:] 238 | vector_ids = collection_group['vector_ids'][:].tolist() 239 | 240 | # Restore vectors 241 | for i, vector_id in enumerate(vector_ids): 242 | collection._vectors[vector_id] = vectors[i] 243 | collection._next_id = max(collection._next_id, vector_id + 1) 244 | 245 | # Load metadata 246 | if 'metadata' in collection_group: 247 | metadata_bytes = collection_group['metadata'][:].tobytes() 248 | loaded_metadata = msgpack.unpackb(metadata_bytes) 249 | # Convert string keys back to integers 250 | collection._metadata = {int(k): v for k, v in loaded_metadata.items()} 251 | 252 | # Load stats (already loaded above for dimension, etc.) 253 | if 'stats' in collection_group: 254 | stats_bytes = collection_group['stats'][:].tobytes() 255 | stats = msgpack.unpackb(stats_bytes) 256 | collection._stats.update(stats) 257 | 258 | # Mark index as needing rebuild 259 | collection._index_built = False 260 | collection._index_needs_rebuild = True 261 | 262 | logger.debug(f"Loaded collection '{collection_name}' with {len(collection._vectors)} vectors") 263 | 264 | return collection 265 | 266 | def save_collection(self, collection: Any, collection_name: str) -> None: 267 | """ 268 | Save a single collection. 
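        Example (a minimal sketch; "documents" is an illustrative collection name
        and my_collection stands for an existing Collection instance):

            manager = StorageManager(storage_path=Path("./octane_data"))
            manager.save_collection(my_collection, "documents")
            restored = manager.load_collection("documents")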
269 | 
270 |         Args:
271 |             collection: Collection object to save
272 |             collection_name: Name of the collection
273 |         """
274 |         if not self.storage_path:
275 |             raise ValueError("Storage path not set")
276 | 
277 |         collection_path = self.storage_path / f"{collection_name}.h5"
278 | 
279 |         with h5py.File(collection_path, 'w') as f:
280 |             self._save_collection(f, collection_name, collection)
281 | 
282 |         logger.info(f"Collection '{collection_name}' saved to {collection_path}")
283 | 
284 |     def load_collection(self, collection_name: str) -> Any:
285 |         """
286 |         Load a single collection.
287 | 
288 |         Args:
289 |             collection_name: Name of the collection to load
290 | 
291 |         Returns:
292 |             Loaded collection
293 |         """
294 |         if not self.storage_path:
295 |             raise ValueError("Storage path not set")
296 | 
297 |         collection_path = self.storage_path / f"{collection_name}.h5"
298 | 
299 |         if not collection_path.exists():
300 |             raise FileNotFoundError(f"Collection file not found: {collection_path}")
301 | 
302 |         with h5py.File(collection_path, 'r') as f:
303 |             collection = self._load_collection(f[collection_name], collection_name)
304 | 
305 |         logger.info(f"Collection '{collection_name}' loaded from {collection_path}")
306 |         return collection
307 | 
308 |     def delete_collection(self, collection_name: str) -> None:
309 |         """
310 |         Delete a collection from storage.
311 | 
312 |         Args:
313 |             collection_name: Name of the collection to delete
314 |         """
315 |         if not self.storage_path:
316 |             return
317 | 
318 |         collection_path = self.storage_path / f"{collection_name}.h5"
319 | 
320 |         if collection_path.exists():
321 |             collection_path.unlink()
322 |             logger.info(f"Collection '{collection_name}' deleted from storage")
323 | 
324 |     def list_saved_collections(self) -> List[str]:
325 |         """List all saved collections."""
326 |         if not self.storage_path:
327 |             return []
328 | 
329 |         collections = []
330 |         for file_path in self.storage_path.glob("*.h5"):
331 |             collections.append(file_path.stem)
332 | 
333 |         return collections
334 | 
335 |     def get_collection_info(self, collection_name: str) -> Optional[Dict[str, Any]]:
336 |         """
337 |         Get information about a saved collection without loading it.
338 | 
339 |         Args:
340 |             collection_name: Name of the collection
341 | 
342 |         Returns:
343 |             Collection information or None if not found
344 |         """
345 |         if not self.storage_path:
346 |             return None
347 | 
348 |         collection_path = self.storage_path / f"{collection_name}.h5"
349 | 
350 |         if not collection_path.exists():
351 |             return None
352 | 
353 |         try:
354 |             with h5py.File(collection_path, 'r') as f:
355 |                 # Collection data is stored in a group named after the collection
356 |                 group = f[collection_name] if collection_name in f else f
357 |                 info = {
358 |                     'name': collection_name,
359 |                     'dimension': group.attrs.get('dimension'),
360 |                     'index_type': group.attrs.get('index_type'),
361 |                     'distance_metric': group.attrs.get('distance_metric'),
362 |                     'vector_count': group['vectors'].shape[0] if 'vectors' in group else 0,
363 |                     'metadata_count': len(msgpack.unpackb(bytes(group['metadata'][:]))) if 'metadata' in group else 0
364 |                 }
365 |                 if 'stats' in group:
366 |                     stats = msgpack.unpackb(bytes(group['stats'][:]))
367 |                     info.update(stats)
368 |                 return info
369 |         except Exception as e:
370 |             logger.error(f"Error reading collection info for '{collection_name}': {e}")
371 |             return None
372 | 
373 |     def optimize_storage(self, collection_name: str) -> None:
374 |         """
375 |         Optimize storage for a collection (e.g., recompress with better settings).
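        Example (a minimal sketch continuing the storage-manager usage above;
        the collection name is illustrative):

            manager.optimize_storage("documents")
            print(manager.get_storage_stats()["total_storage_size"])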
376 | 377 | Args: 378 | collection_name: Name of the collection to optimize 379 | """ 380 | if not self.storage_path: 381 | return 382 | 383 | collection_path = self.storage_path / f"{collection_name}.h5" 384 | 385 | if not collection_path.exists(): 386 | logger.warning(f"Collection '{collection_name}' not found for optimization") 387 | return 388 | 389 | # Load and resave with optimized compression 390 | try: 391 | with h5py.File(collection_path, 'r') as f: 392 | collection = self._load_collection(f, collection_name) 393 | 394 | # Save with optimized settings 395 | optimized_path = collection_path.with_suffix('.optimized.h5') 396 | with h5py.File(optimized_path, 'w') as f: 397 | self._save_collection(f, collection_name, collection) 398 | 399 | # Replace original file 400 | collection_path.unlink() 401 | optimized_path.rename(collection_path) 402 | 403 | logger.info(f"Storage optimized for collection '{collection_name}'") 404 | 405 | except Exception as e: 406 | logger.error(f"Error optimizing storage for collection '{collection_name}': {e}") 407 | 408 | def get_storage_stats(self) -> Dict[str, Any]: 409 | """Get storage statistics.""" 410 | stats = { 411 | 'storage_path': str(self.storage_path) if self.storage_path else None, 412 | 'cache_enabled': self.enable_cache, 413 | 'cache_size': self.cache_size, 414 | 'cache_hits': self._cache_hits, 415 | 'cache_misses': self._cache_misses, 416 | 'cache_hit_rate': self._cache_hits / (self._cache_hits + self._cache_misses) if (self._cache_hits + self._cache_misses) > 0 else 0 417 | } 418 | 419 | if self.storage_path: 420 | # File system stats 421 | total_size = 0 422 | file_count = 0 423 | for file_path in self.storage_path.glob("*.h5"): 424 | total_size += file_path.stat().st_size 425 | file_count += 1 426 | 427 | stats['total_storage_size'] = total_size 428 | stats['file_count'] = file_count 429 | stats['saved_collections'] = self.list_saved_collections() 430 | 431 | return stats 432 | 433 | def clear_cache(self) -> None: 434 | """Clear the storage cache.""" 435 | self._cache.clear() 436 | self._cache_hits = 0 437 | self._cache_misses = 0 438 | logger.info("Storage cache cleared") 439 | 440 | def _get_cache_key(self, key: str) -> str: 441 | """Generate a cache key.""" 442 | return f"storage_{key}" 443 | 444 | def _get_from_cache(self, key: str) -> Optional[Any]: 445 | """Get value from cache.""" 446 | if not self.enable_cache: 447 | return None 448 | 449 | cache_key = self._get_cache_key(key) 450 | if cache_key in self._cache: 451 | self._cache_hits += 1 452 | return self._cache[cache_key] 453 | 454 | self._cache_misses += 1 455 | return None 456 | 457 | def _set_cache(self, key: str, value: Any) -> None: 458 | """Set value in cache.""" 459 | if not self.enable_cache: 460 | return 461 | 462 | cache_key = self._get_cache_key(key) 463 | 464 | # Implement LRU cache 465 | if len(self._cache) >= self.cache_size: 466 | # Remove oldest item (simple implementation) 467 | oldest_key = next(iter(self._cache)) 468 | del self._cache[oldest_key] 469 | 470 | self._cache[cache_key] = value 471 | -------------------------------------------------------------------------------- /octanedb/index.py: -------------------------------------------------------------------------------- 1 | """ 2 | Index implementations for fast vector similarity search. 
3 | """ 4 | 5 | import numpy as np 6 | import logging 7 | from typing import List, Tuple, Optional, Dict, Any 8 | from enum import Enum 9 | import time 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class IndexType(Enum): 15 | """Supported index types.""" 16 | HNSW = "hnsw" 17 | FLAT = "flat" 18 | IVF = "ivf" 19 | 20 | 21 | class HNSWIndex: 22 | """ 23 | Hierarchical Navigable Small World (HNSW) index for fast vector similarity search. 24 | 25 | HNSW is a graph-based index that provides: 26 | - Sub-linear search complexity 27 | - High accuracy for similarity search 28 | - Efficient construction and updates 29 | - Configurable search depth and connections 30 | """ 31 | 32 | def __init__( 33 | self, 34 | dimension: int, 35 | m: int = 16, 36 | ef_construction: int = 200, 37 | ef_search: int = 100, 38 | max_elements: int = 1000000, 39 | distance_metric: str = "cosine" 40 | ): 41 | """ 42 | Initialize HNSW index. 43 | 44 | Args: 45 | dimension: Vector dimension 46 | m: Maximum number of connections per layer 47 | ef_construction: Construction search depth 48 | ef_search: Search depth 49 | max_elements: Maximum number of vectors 50 | distance_metric: Distance metric for similarity 51 | """ 52 | self.dimension = dimension 53 | self.m = m 54 | self.ef_construction = ef_construction 55 | self.ef_search = ef_search 56 | self.max_elements = max_elements 57 | self.distance_metric = distance_metric 58 | 59 | # Graph structure 60 | self._layers: List[Dict[int, List[int]]] = [] # Adjacency lists for each layer 61 | self._vectors: Optional[np.ndarray] = None 62 | self._vector_ids: Optional[List[int]] = None 63 | self._max_layer: int = 0 64 | 65 | # Entry point 66 | self._entry_point: Optional[int] = None 67 | self._entry_layer: int = 0 68 | 69 | # Distance computation 70 | self._distance_func = self._get_distance_function() 71 | 72 | logger.info(f"HNSW index initialized with m={m}, ef_construction={ef_construction}") 73 | 74 | def _get_distance_function(self): 75 | """Get the appropriate distance function.""" 76 | if self.distance_metric == "cosine": 77 | return self._cosine_distance 78 | elif self.distance_metric == "euclidean": 79 | return self._euclidean_distance 80 | elif self.distance_metric == "dot": 81 | return self._dot_distance 82 | else: 83 | raise ValueError(f"Unsupported distance metric: {self.distance_metric}") 84 | 85 | def _cosine_distance(self, a: np.ndarray, b: np.ndarray) -> float: 86 | """Compute cosine distance between vectors.""" 87 | dot_product = np.dot(a, b) 88 | norm_a = np.linalg.norm(a) 89 | norm_b = np.linalg.norm(b) 90 | if norm_a == 0 or norm_b == 0: 91 | return 1.0 92 | return 1.0 - (dot_product / (norm_a * norm_b)) 93 | 94 | def _euclidean_distance(self, a: np.ndarray, b: np.ndarray) -> float: 95 | """Compute Euclidean distance between vectors.""" 96 | return np.linalg.norm(a - b) 97 | 98 | def _dot_distance(self, a: np.ndarray, b: np.ndarray) -> float: 99 | """Compute negative dot product distance.""" 100 | return -np.dot(a, b) 101 | 102 | def build(self, vectors: np.ndarray, vector_ids: List[int]) -> None: 103 | """ 104 | Build the HNSW index from vectors. 
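        Example (a minimal sketch with random toy data, shown only to
        illustrate the call pattern):

            index = HNSWIndex(dimension=4)
            vectors = np.random.rand(100, 4).astype(np.float32)
            index.build(vectors, vector_ids=list(range(100)))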
105 | 106 | Args: 107 | vectors: Array of vectors to index 108 | vector_ids: Corresponding vector IDs 109 | """ 110 | if len(vectors) == 0: 111 | logger.warning("No vectors to index") 112 | return 113 | 114 | start_time = time.time() 115 | 116 | # Store vectors and IDs 117 | self._vectors = vectors.astype(np.float32) 118 | self._vector_ids = vector_ids 119 | 120 | # Initialize layers 121 | self._layers = [{} for _ in range(self._get_max_layer(len(vectors)) + 1)] 122 | 123 | # Insert vectors one by one 124 | for i in range(len(vectors)): 125 | self._insert_vector(i, vector_ids[i]) 126 | logger.debug(f"Inserted vector {i} (ID: {vector_ids[i]}) at layer {self._layers[-1] if self._layers else 'N/A'}") 127 | 128 | build_time = time.time() - start_time 129 | logger.info(f"HNSW index built for {len(vectors)} vectors in {build_time:.4f}s") 130 | logger.info(f"Final entry point: {self._entry_point}, entry layer: {self._entry_layer}") 131 | logger.info(f"Number of layers: {len(self._layers)}") 132 | 133 | def _get_max_layer(self, num_vectors: int) -> int: 134 | """Calculate maximum layer for the given number of vectors.""" 135 | return max(0, int(np.log(num_vectors) / np.log(self.m))) 136 | 137 | def _insert_vector(self, vector_idx: int, vector_id: int) -> None: 138 | """Insert a single vector into the index.""" 139 | # Determine layer for this vector 140 | layer = self._get_random_layer() 141 | logger.debug(f"Vector {vector_idx} (ID: {vector_id}) assigned to layer {layer}") 142 | 143 | # Find nearest neighbors in current layer and lower layers 144 | if self._entry_point is not None: 145 | nearest = self._search_layer( 146 | self._vectors[vector_idx], 147 | self._entry_point, 148 | self.ef_construction, 149 | layer 150 | ) 151 | 152 | # If no neighbors found in current layer, search in lower layers 153 | if not nearest and layer > 0: 154 | for lower_layer in range(layer - 1, -1, -1): 155 | lower_nearest = self._search_layer( 156 | self._vectors[vector_idx], 157 | self._entry_point, 158 | self.ef_construction, 159 | lower_layer 160 | ) 161 | if lower_nearest: 162 | nearest = lower_nearest 163 | break 164 | 165 | # If still no neighbors found, create a simple connection to the entry point 166 | if not nearest: 167 | nearest = [(self._entry_point, self._distance_func( 168 | self._vectors[vector_idx], 169 | self._vectors[self._entry_point] 170 | ))] 171 | else: 172 | nearest = [] 173 | 174 | # Add connections in current layer 175 | self._add_connections(vector_idx, nearest, layer) 176 | 177 | # Update entry point if needed 178 | if self._entry_point is None or layer > self._entry_layer: 179 | self._entry_point = vector_idx 180 | self._entry_layer = layer 181 | logger.debug(f"Updated entry point to {vector_idx} at layer {layer}") 182 | 183 | # Add to layers 184 | for l in range(layer + 1): 185 | if l not in self._layers: 186 | self._layers.append({}) 187 | self._layers[l][vector_idx] = [] 188 | 189 | # Ensure the vector is also added to the bottom layer (layer 0) for searchability 190 | if 0 not in self._layers: 191 | self._layers.append({}) 192 | if vector_idx not in self._layers[0]: 193 | self._layers[0][vector_idx] = [] 194 | 195 | def _get_random_layer(self) -> int: 196 | """Get random layer based on exponential distribution.""" 197 | # Use a smaller factor to avoid extremely high layers 198 | # The original used self.m which was too large 199 | factor = 1.0 # This gives more reasonable layer distribution 200 | return int(-np.log(np.random.random()) * factor) 201 | 202 | def _search_layer( 203 | 
self, 204 | query: np.ndarray, 205 | entry_point: int, 206 | ef: int, 207 | layer: int 208 | ) -> List[Tuple[int, float]]: 209 | """ 210 | Search for nearest neighbors in a specific layer. 211 | 212 | Args: 213 | query: Query vector 214 | entry_point: Starting point for search 215 | ef: Search depth 216 | layer: Layer to search in 217 | 218 | Returns: 219 | List of (vector_idx, distance) tuples 220 | """ 221 | if layer >= len(self._layers): 222 | return [] 223 | 224 | # Initialize candidates and visited 225 | candidates = [(entry_point, self._distance_func(query, self._vectors[entry_point]))] 226 | visited = {entry_point} 227 | 228 | # Search loop 229 | while candidates: 230 | # Get closest candidate 231 | current_idx, current_dist = candidates.pop(0) 232 | 233 | # Check if we can improve 234 | if len(candidates) >= ef and current_dist > candidates[-1][1]: 235 | break 236 | 237 | # Explore neighbors 238 | for neighbor_idx in self._layers[layer].get(current_idx, []): 239 | if neighbor_idx in visited: 240 | continue 241 | 242 | visited.add(neighbor_idx) 243 | neighbor_dist = self._distance_func(query, self._vectors[neighbor_idx]) 244 | 245 | # Add to candidates if better than worst 246 | if len(candidates) < ef or neighbor_dist < candidates[-1][1]: 247 | # Insert in sorted order 248 | insert_pos = 0 249 | for i, (_, dist) in enumerate(candidates): 250 | if neighbor_dist < dist: 251 | insert_pos = i 252 | break 253 | 254 | candidates.insert(insert_pos, (neighbor_idx, neighbor_dist)) 255 | 256 | # Keep only top ef candidates 257 | if len(candidates) > ef: 258 | candidates = candidates[:ef] 259 | 260 | return candidates 261 | 262 | def _add_connections(self, vector_idx: int, nearest: List[Tuple[int, float]], layer: int) -> None: 263 | """Add connections for a vector in a specific layer.""" 264 | if layer >= len(self._layers): 265 | return 266 | 267 | # Sort by distance 268 | nearest.sort(key=lambda x: x[1]) 269 | 270 | # Add bidirectional connections in the current layer 271 | for neighbor_idx, _ in nearest[:self.m]: 272 | # Add connection from vector to neighbor 273 | if vector_idx not in self._layers[layer]: 274 | self._layers[layer][vector_idx] = [] 275 | self._layers[layer][vector_idx].append(neighbor_idx) 276 | 277 | # Add connection from neighbor to vector 278 | if neighbor_idx not in self._layers[layer]: 279 | self._layers[layer][neighbor_idx] = [] 280 | self._layers[layer][neighbor_idx].append(vector_idx) 281 | 282 | # Also add connections in the bottom layer (layer 0) for better searchability 283 | if layer > 0 and 0 < len(self._layers): 284 | for neighbor_idx, _ in nearest[:self.m]: 285 | # Add connection from vector to neighbor in bottom layer 286 | if vector_idx not in self._layers[0]: 287 | self._layers[0][vector_idx] = [] 288 | if neighbor_idx not in self._layers[0][vector_idx]: 289 | self._layers[0][vector_idx].append(neighbor_idx) 290 | 291 | # Add connection from neighbor to vector in bottom layer 292 | if neighbor_idx not in self._layers[0]: 293 | self._layers[0][neighbor_idx] = [] 294 | if vector_idx not in self._layers[0][neighbor_idx]: 295 | self._layers[0][neighbor_idx].append(vector_idx) 296 | 297 | def search(self, query: np.ndarray, k: int) -> List[Tuple[int, float]]: 298 | """ 299 | Search for k nearest neighbors. 
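        Example (a minimal sketch, continuing from an index built as in the
        build() example above):

            query = np.random.rand(4).astype(np.float32)
            results = index.search(query, k=5)  # [(vector_id, distance), ...]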
300 | 301 | Args: 302 | query: Query vector 303 | k: Number of results to return 304 | 305 | Returns: 306 | List of (vector_id, distance) tuples 307 | """ 308 | if self._entry_point is None: 309 | logger.warning("HNSW search failed: entry_point is None") 310 | return [] 311 | 312 | logger.debug(f"HNSW search: entry_point={self._entry_point}, entry_layer={self._entry_layer}, layers={len(self._layers)}") 313 | 314 | # Start from top layer 315 | current_layer = self._entry_layer 316 | current_point = self._entry_point 317 | 318 | # Search down through layers 319 | while current_layer > 0: 320 | nearest = self._search_layer(query, current_point, 1, current_layer) 321 | if nearest: 322 | current_point = nearest[0][0] 323 | current_layer -= 1 324 | 325 | # Search in bottom layer (layer 0) where most connections are 326 | nearest = self._search_layer(query, current_point, self.ef_search, 0) 327 | 328 | # If no results found in bottom layer, try searching from all vectors in bottom layer 329 | if not nearest and len(self._layers) > 0 and self._layers[0]: 330 | all_candidates = [] 331 | for vector_idx in self._layers[0].keys(): 332 | distance = self._distance_func(query, self._vectors[vector_idx]) 333 | all_candidates.append((vector_idx, distance)) 334 | 335 | # Sort by distance and take top k 336 | all_candidates.sort(key=lambda x: x[1]) 337 | nearest = all_candidates[:k] 338 | 339 | # Convert to vector IDs and return top k 340 | results = [] 341 | for vector_idx, distance in nearest[:k]: 342 | vector_id = self._vector_ids[vector_idx] 343 | results.append((vector_id, distance)) 344 | 345 | return results 346 | 347 | def search_batch(self, queries: np.ndarray, k: int) -> List[List[Tuple[int, float]]]: 348 | """ 349 | Batch search for multiple queries. 350 | 351 | Args: 352 | queries: Array of query vectors 353 | k: Number of results per query 354 | 355 | Returns: 356 | List of result lists for each query 357 | """ 358 | results = [] 359 | for query in queries: 360 | query_results = self.search(query, k) 361 | results.append(query_results) 362 | return results 363 | 364 | def optimize(self) -> None: 365 | """Optimize the index structure.""" 366 | # Rebalance connections if needed 367 | for layer in range(len(self._layers)): 368 | for vector_idx in list(self._layers[layer].keys()): 369 | connections = self._layers[layer][vector_idx] 370 | if len(connections) > self.m * 2: 371 | # Keep only closest connections 372 | if self._vectors is not None: 373 | distances = [ 374 | (neighbor_idx, self._distance_func( 375 | self._vectors[vector_idx], 376 | self._vectors[neighbor_idx] 377 | )) 378 | for neighbor_idx in connections 379 | ] 380 | distances.sort(key=lambda x: x[1]) 381 | best_connections = [idx for idx, _ in distances[:self.m]] 382 | 383 | # Update connections 384 | self._layers[layer][vector_idx] = best_connections 385 | 386 | # Remove reverse connections 387 | for neighbor_idx in best_connections: 388 | if neighbor_idx in self._layers[layer]: 389 | if vector_idx in self._layers[layer][neighbor_idx]: 390 | self._layers[layer][neighbor_idx].remove(vector_idx) 391 | 392 | logger.info("HNSW index optimization completed") 393 | 394 | def get_stats(self) -> Dict[str, Any]: 395 | """Get index statistics.""" 396 | stats = { 397 | "index_type": "hnsw", 398 | "dimension": self.dimension, 399 | "m": self.m, 400 | "ef_construction": self.ef_construction, 401 | "ef_search": self.ef_search, 402 | "max_elements": self.max_elements, 403 | "distance_metric": self.distance_metric, 404 | "num_vectors": 
len(self._vector_ids) if self._vector_ids else 0, 405 | "num_layers": len(self._layers), 406 | "entry_point": self._entry_point, 407 | "entry_layer": self._entry_layer 408 | } 409 | 410 | # Connection statistics 411 | if self._layers: 412 | total_connections = sum(len(connections) for connections in self._layers[0].values()) 413 | avg_connections = total_connections / len(self._layers[0]) if self._layers[0] else 0 414 | stats["total_connections"] = total_connections 415 | stats["avg_connections"] = avg_connections 416 | 417 | return stats 418 | 419 | 420 | class FlatIndex: 421 | """ 422 | Simple flat index for exact search (used for small datasets or testing). 423 | """ 424 | 425 | def __init__(self, dimension: int, distance_metric: str = "cosine"): 426 | self.dimension = dimension 427 | self.distance_metric = distance_metric 428 | self._vectors: Optional[np.ndarray] = None 429 | self._vector_ids: Optional[List[int]] = None 430 | self._distance_func = self._get_distance_function() 431 | 432 | def _get_distance_function(self): 433 | """Get the appropriate distance function.""" 434 | if self.distance_metric == "cosine": 435 | return self._cosine_distance 436 | elif self.distance_metric == "euclidean": 437 | return self._euclidean_distance 438 | elif self.distance_metric == "dot": 439 | return self._dot_distance 440 | else: 441 | raise ValueError(f"Unsupported distance metric: {self.distance_metric}") 442 | 443 | def _cosine_distance(self, a: np.ndarray, b: np.ndarray) -> float: 444 | """Compute cosine distance between vectors.""" 445 | dot_product = np.dot(a, b) 446 | norm_a = np.linalg.norm(a) 447 | norm_b = np.linalg.norm(b) 448 | if norm_a == 0 or norm_b == 0: 449 | return 1.0 450 | return 1.0 - (dot_product / (norm_a * norm_b)) 451 | 452 | def _euclidean_distance(self, a: np.ndarray, b: np.ndarray) -> float: 453 | """Compute Euclidean distance between vectors.""" 454 | return np.linalg.norm(a - b) 455 | 456 | def _dot_distance(self, a: np.ndarray, b: np.ndarray) -> float: 457 | """Compute negative dot product distance.""" 458 | return -np.dot(a, b) 459 | 460 | def build(self, vectors: np.ndarray, vector_ids: List[int]) -> None: 461 | """Build the flat index.""" 462 | self._vectors = vectors.astype(np.float32) 463 | self._vector_ids = vector_ids 464 | 465 | def search(self, query: np.ndarray, k: int) -> List[Tuple[int, float]]: 466 | """Search for k nearest neighbors using exact computation.""" 467 | if self._vectors is None: 468 | return [] 469 | 470 | # Compute distances to all vectors 471 | distances = [] 472 | for i, vector in enumerate(self._vectors): 473 | distance = self._distance_func(query, vector) 474 | distances.append((i, distance)) 475 | 476 | # Sort by distance and return top k 477 | distances.sort(key=lambda x: x[1]) 478 | 479 | results = [] 480 | for vector_idx, distance in distances[:k]: 481 | vector_id = self._vector_ids[vector_idx] 482 | results.append((vector_id, distance)) 483 | 484 | return results 485 | 486 | def search_batch(self, queries: np.ndarray, k: int) -> List[List[Tuple[int, float]]]: 487 | """Batch search for multiple queries.""" 488 | results = [] 489 | for query in queries: 490 | query_results = self.search(query, k) 491 | results.append(query_results) 492 | return results 493 | 494 | def optimize(self) -> None: 495 | """No optimization needed for flat index.""" 496 | pass 497 | 498 | def get_stats(self) -> Dict[str, Any]: 499 | """Get index statistics.""" 500 | return { 501 | "index_type": "flat", 502 | "dimension": self.dimension, 503 | 
"distance_metric": self.distance_metric, 504 | "num_vectors": len(self._vector_ids) if self._vector_ids else 0 505 | } 506 | -------------------------------------------------------------------------------- /octanedb/core.py: -------------------------------------------------------------------------------- 1 | """ 2 | OctaneDB Core - Main database interface. 3 | Provides high-level operations for vector database management. 4 | """ 5 | 6 | import logging 7 | from typing import Dict, List, Optional, Union, Any, Tuple 8 | from pathlib import Path 9 | import numpy as np 10 | 11 | from .collection import Collection 12 | from .storage import StorageManager 13 | from .embeddings import TextEmbedder, ChromaCompatibleEmbedder 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | class OctaneDB: 19 | """ 20 | OctaneDB - Lightning Fast Vector Database. 21 | 22 | A lightweight, high-performance vector database library that provides: 23 | - Fast vector similarity search using HNSW indexing 24 | - Automatic text embedding generation 25 | - ChromaDB-compatible API 26 | - Multiple storage modes (in-memory, persistent, hybrid) 27 | - Advanced metadata filtering 28 | - Batch operations for improved performance 29 | """ 30 | 31 | def __init__( 32 | self, 33 | dimension: int, 34 | index_type: str = "hnsw", 35 | m: int = 16, 36 | ef_construction: int = 200, 37 | ef_search: int = 100, 38 | max_elements: int = 1000000, 39 | distance_metric: str = "cosine", 40 | storage_path: Optional[Union[str, Path]] = None, 41 | embedding_model: Optional[str] = None, 42 | enable_text_embeddings: bool = True, 43 | **kwargs 44 | ): 45 | """ 46 | Initialize OctaneDB. 47 | 48 | Args: 49 | dimension: Vector dimension 50 | index_type: Type of index to use 51 | m: HNSW connections per layer 52 | ef_construction: Construction search depth 53 | ef_search: Search depth 54 | max_elements: Maximum number of vectors 55 | distance_metric: Distance metric for similarity 56 | storage_path: Path for persistent storage 57 | embedding_model: Sentence-transformers model for text embeddings 58 | enable_text_embeddings: Whether to enable text embedding functionality 59 | **kwargs: Additional arguments for collections 60 | """ 61 | self.dimension = dimension 62 | self.index_type = index_type 63 | self.m = m 64 | self.ef_construction = ef_construction 65 | self.ef_search = ef_search 66 | self.max_elements = max_elements 67 | self.distance_metric = distance_metric 68 | self.embedding_model = embedding_model 69 | self.enable_text_embeddings = enable_text_embeddings 70 | 71 | # Storage management 72 | self._storage_path = Path(storage_path) if storage_path else None 73 | self._storage_manager = StorageManager(storage_path=self._storage_path) if self._storage_path else None 74 | 75 | # Collection management 76 | self._collections: Dict[str, Collection] = {} 77 | self._current_collection: Optional[Collection] = None 78 | 79 | # Text embedding support 80 | self._text_embedder: Optional[TextEmbedder] = None 81 | if enable_text_embeddings and embedding_model: 82 | try: 83 | self._text_embedder = TextEmbedder(embedding_model) 84 | # Update dimension if it doesn't match 85 | if self._text_embedder.dimension != dimension: 86 | logger.warning(f"Embedding model dimension ({self._text_embedder.dimension}) " 87 | f"doesn't match specified dimension ({dimension})") 88 | self.dimension = self._text_embedder.dimension 89 | except ImportError: 90 | logger.warning("Text embeddings disabled: sentence-transformers not available") 91 | self._text_embedder = None 92 
| 93 | # Performance tracking 94 | self._stats = { 95 | "collections_created": 0, 96 | "total_vectors": 0, 97 | "total_documents": 0, 98 | "operations_performed": 0 99 | } 100 | 101 | logger.info(f"OctaneDB initialized with dimension {self.dimension}") 102 | if self._text_embedder: 103 | logger.info(f"Text embeddings enabled with model: {embedding_model}") 104 | 105 | def create_collection( 106 | self, 107 | name: str, 108 | **kwargs 109 | ) -> Collection: 110 | """ 111 | Create a new collection. 112 | 113 | Args: 114 | name: Collection name 115 | **kwargs: Additional collection parameters 116 | 117 | Returns: 118 | Created collection instance 119 | """ 120 | if name in self._collections: 121 | raise ValueError(f"Collection '{name}' already exists") 122 | 123 | # Use embedding model from OctaneDB if not specified 124 | if "embedding_model" not in kwargs and self.embedding_model: 125 | kwargs["embedding_model"] = self.embedding_model 126 | 127 | # Create collection with text embedding support 128 | collection = Collection( 129 | name=name, 130 | dimension=self.dimension, 131 | index_type=self.index_type, 132 | m=self.m, 133 | ef_construction=self.ef_construction, 134 | ef_search=self.ef_search, 135 | max_elements=self.max_elements, 136 | distance_metric=self.distance_metric, 137 | enable_text_embeddings=self.enable_text_embeddings, 138 | **kwargs 139 | ) 140 | 141 | self._collections[name] = collection 142 | 143 | # Set as current collection if it's the first one 144 | if self._current_collection is None: 145 | self._current_collection = collection 146 | 147 | # Update stats 148 | self._stats["collections_created"] += 1 149 | 150 | logger.info(f"Collection '{name}' created successfully") 151 | return collection 152 | 153 | def use_collection(self, name: str) -> Collection: 154 | """ 155 | Set the current collection for operations. 156 | 157 | Args: 158 | name: Collection name 159 | 160 | Returns: 161 | Collection instance 162 | """ 163 | if name not in self._collections: 164 | raise ValueError(f"Collection '{name}' does not exist") 165 | 166 | self._current_collection = self._collections[name] 167 | logger.debug(f"Switched to collection: {name}") 168 | return self._current_collection 169 | 170 | def get_collection(self, name: str) -> Collection: 171 | """ 172 | Get a collection by name. 173 | 174 | Args: 175 | name: Collection name 176 | 177 | Returns: 178 | Collection instance 179 | """ 180 | if name not in self._collections: 181 | raise ValueError(f"Collection '{name}' does not exist") 182 | 183 | return self._collections[name] 184 | 185 | def list_collections(self) -> List[str]: 186 | """List all collection names.""" 187 | return list(self._collections.keys()) 188 | 189 | def delete_collection(self, name: str) -> bool: 190 | """ 191 | Delete a collection. 
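        Example (a minimal sketch of the collection lifecycle; the collection
        name is illustrative):

            db = OctaneDB(dimension=384)
            db.create_collection("articles")
            db.use_collection("articles")
            db.delete_collection("articles")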
192 | 193 | Args: 194 | name: Collection name 195 | 196 | Returns: 197 | True if deleted, False if not found 198 | """ 199 | if name not in self._collections: 200 | return False 201 | 202 | # Clear current collection if it's the one being deleted 203 | if self._current_collection == self._collections[name]: 204 | self._current_collection = None 205 | 206 | # Delete collection 207 | del self._collections[name] 208 | 209 | # Set new current collection if needed 210 | if self._current_collection is None and self._collections: 211 | self._current_collection = list(self._collections.values())[0] 212 | 213 | logger.info(f"Collection '{name}' deleted") 214 | return True 215 | 216 | def insert( 217 | self, 218 | vectors: Union[np.ndarray, List], 219 | metadata: Optional[List[Dict[str, Any]]] = None, 220 | ids: Optional[List[int]] = None 221 | ) -> Union[int, List[int]]: 222 | """ 223 | Insert vectors into the current collection. 224 | 225 | Args: 226 | vectors: Vector(s) to insert 227 | metadata: Optional metadata for each vector 228 | ids: Optional custom IDs 229 | 230 | Returns: 231 | Inserted vector ID(s) 232 | """ 233 | if self._current_collection is None: 234 | raise RuntimeError("No current collection. Use create_collection() or use_collection() first.") 235 | 236 | result = self._current_collection.insert(vectors, metadata, ids) 237 | self._stats["operations_performed"] += 1 238 | self._update_total_stats() 239 | return result 240 | 241 | def add( 242 | self, 243 | ids: Optional[Union[str, List[str]]] = None, 244 | documents: Union[str, List[str]] = None, 245 | metadatas: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, 246 | embeddings: Optional[Union[List[float], List[List[float]]]] = None 247 | ) -> Dict[str, Any]: 248 | """ 249 | ChromaDB-compatible add method for text documents. 250 | 251 | Args: 252 | ids: Document IDs (auto-generated if not provided) 253 | documents: Text documents to add 254 | metadatas: Optional metadata for documents 255 | embeddings: Pre-computed embeddings (optional) 256 | 257 | Returns: 258 | Dictionary with 'ids' and 'embeddings' keys 259 | """ 260 | if self._current_collection is None: 261 | raise RuntimeError("No current collection. Use create_collection() or use_collection() first.") 262 | 263 | result = self._current_collection.add(ids, documents, metadatas, embeddings) 264 | self._stats["operations_performed"] += 1 265 | self._update_total_stats() 266 | return result 267 | 268 | def add_text_documents( 269 | self, 270 | documents: Union[str, List[str]], 271 | ids: Optional[Union[str, List[str]]] = None, 272 | metadatas: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, 273 | batch_size: int = 32, 274 | show_progress_bar: bool = False 275 | ) -> Dict[str, Any]: 276 | """ 277 | Add text documents with automatic embedding generation. 278 | 279 | Args: 280 | documents: Text document(s) to add 281 | ids: Document IDs (auto-generated if not provided) 282 | metadatas: Optional metadata for documents 283 | batch_size: Batch size for embedding generation 284 | show_progress_bar: Whether to show progress bar 285 | 286 | Returns: 287 | Dictionary with 'ids', 'embeddings', and 'documents' keys 288 | """ 289 | if self._current_collection is None: 290 | raise RuntimeError("No current collection. 
Use create_collection() or use_collection() first.") 291 | 292 | result = self._current_collection.add_text_documents( 293 | documents, ids, metadatas, batch_size, show_progress_bar 294 | ) 295 | self._stats["operations_performed"] += 1 296 | self._update_total_stats() 297 | return result 298 | 299 | def search( 300 | self, 301 | query_vector: np.ndarray, 302 | k: int = 10, 303 | filter: Optional[Dict[str, Any]] = None, 304 | include_metadata: bool = False 305 | ) -> List[Tuple[int, float, Optional[Dict[str, Any]]]]: 306 | """ 307 | Search for similar vectors in the current collection. 308 | 309 | Args: 310 | query_vector: Query vector 311 | k: Number of results to return 312 | filter: Optional metadata filter 313 | include_metadata: Whether to include metadata in results 314 | 315 | Returns: 316 | List of (id, distance, metadata) tuples 317 | """ 318 | if self._current_collection is None: 319 | raise RuntimeError("No current collection. Use create_collection() or use_collection() first.") 320 | 321 | result = self._current_collection.search(query_vector, k, filter, include_metadata) 322 | self._stats["operations_performed"] += 1 323 | return result 324 | 325 | def search_text( 326 | self, 327 | query_text: str, 328 | k: int = 10, 329 | filter: Optional[Dict[str, Any]] = None, 330 | include_metadata: bool = False 331 | ) -> List[Tuple[int, float, Optional[Dict[str, Any]]]]: 332 | """ 333 | Search for similar documents using text query. 334 | 335 | Args: 336 | query_text: Text query to search for 337 | k: Number of results to return 338 | filter: Optional metadata filter 339 | include_metadata: Whether to include metadata in results 340 | 341 | Returns: 342 | List of (id, distance, metadata) tuples 343 | """ 344 | if self._current_collection is None: 345 | raise RuntimeError("No current collection. Use create_collection() or use_collection() first.") 346 | 347 | result = self._current_collection.search_text(query_text, k, filter, include_metadata) 348 | self._stats["operations_performed"] += 1 349 | return result 350 | 351 | def search_batch( 352 | self, 353 | query_vectors: np.ndarray, 354 | k: int = 10, 355 | filter: Optional[Dict[str, Any]] = None, 356 | include_metadata: bool = False 357 | ) -> List[List[Tuple[int, float, Optional[Dict[str, Any]]]]]: 358 | """ 359 | Batch search for similar vectors. 360 | 361 | Args: 362 | query_vectors: Query vectors 363 | k: Number of results per query 364 | filter: Optional metadata filter 365 | include_metadata: Whether to include metadata in results 366 | 367 | Returns: 368 | List of result lists for each query 369 | """ 370 | if self._current_collection is None: 371 | raise RuntimeError("No current collection. Use create_collection() or use_collection() first.") 372 | 373 | result = self._current_collection.search_batch(query_vectors, k, filter, include_metadata) 374 | self._stats["operations_performed"] += 1 375 | return result 376 | 377 | def search_text_batch( 378 | self, 379 | query_texts: List[str], 380 | k: int = 10, 381 | filter: Optional[Dict[str, Any]] = None, 382 | include_metadata: bool = False, 383 | batch_size: int = 32 384 | ) -> List[List[Tuple[int, float, Optional[Dict[str, Any]]]]]: 385 | """ 386 | Batch search for similar documents using text queries. 
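        Example (a minimal sketch; the model name, documents, and query strings
        are illustrative):

            db = OctaneDB(dimension=384, embedding_model="all-MiniLM-L6-v2")
            db.create_collection("docs")
            db.add_text_documents(["OctaneDB stores vectors.", "HNSW speeds up search."])
            hits = db.search_text_batch(["fast vector search"], k=2)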
387 | 388 | Args: 389 | query_texts: List of text queries 390 | k: Number of results per query 391 | filter: Optional metadata filter 392 | include_metadata: Whether to include metadata in results 393 | batch_size: Batch size for embedding generation 394 | 395 | Returns: 396 | List of result lists for each query 397 | """ 398 | if self._current_collection is None: 399 | raise RuntimeError("No current collection. Use create_collection() or use_collection() first.") 400 | 401 | result = self._current_collection.search_text_batch( 402 | query_texts, k, filter, include_metadata, batch_size 403 | ) 404 | self._stats["operations_performed"] += 1 405 | return result 406 | 407 | def update(self, id: int, vector: np.ndarray, metadata: Optional[Dict[str, Any]] = None) -> None: 408 | """ 409 | Update a vector in the current collection. 410 | 411 | Args: 412 | id: Vector ID to update 413 | vector: New vector 414 | metadata: New metadata 415 | """ 416 | if self._current_collection is None: 417 | raise RuntimeError("No current collection. Use create_collection() or use_collection() first.") 418 | 419 | self._current_collection.update(id, vector, metadata) 420 | self._stats["operations_performed"] += 1 421 | 422 | def delete(self, id: int) -> None: 423 | """ 424 | Delete a vector from the current collection. 425 | 426 | Args: 427 | id: Vector ID to delete 428 | """ 429 | if self._current_collection is None: 430 | raise RuntimeError("No current collection. Use create_collection() or use_collection() first.") 431 | 432 | self._current_collection.delete(id) 433 | self._stats["operations_performed"] += 1 434 | self._update_total_stats() 435 | 436 | def delete_batch(self, ids: List[int]) -> None: 437 | """ 438 | Batch delete vectors from the current collection. 439 | 440 | Args: 441 | ids: List of vector IDs to delete 442 | """ 443 | if self._current_collection is None: 444 | raise RuntimeError("No current collection. Use create_collection() or use_collection() first.") 445 | 446 | self._current_collection.delete_batch(ids) 447 | self._stats["operations_performed"] += 1 448 | self._update_total_stats() 449 | 450 | def get_vector(self, id: int, include_metadata: bool = False) -> Union[np.ndarray, Tuple[np.ndarray, Optional[Dict[str, Any]]]]: 451 | """ 452 | Get a vector from the current collection. 453 | 454 | Args: 455 | id: Vector ID 456 | include_metadata: Whether to include metadata 457 | 458 | Returns: 459 | Vector or (vector, metadata) tuple 460 | """ 461 | if self._current_collection is None: 462 | raise RuntimeError("No current collection. Use create_collection() or use_collection() first.") 463 | 464 | return self._current_collection.get_vector(id, include_metadata) 465 | 466 | def get_document(self, doc_id: str) -> Optional[str]: 467 | """ 468 | Get a text document from the current collection. 469 | 470 | Args: 471 | doc_id: Document ID 472 | 473 | Returns: 474 | Document text or None if not found 475 | """ 476 | if self._current_collection is None: 477 | raise RuntimeError("No current collection. Use create_collection() or use_collection() first.") 478 | 479 | return self._current_collection.get_document(doc_id) 480 | 481 | def list_documents(self) -> List[str]: 482 | """List all text document IDs in the current collection.""" 483 | if self._current_collection is None: 484 | raise RuntimeError("No current collection. 
Use create_collection() or use_collection() first.") 485 | 486 | return self._current_collection.list_documents() 487 | 488 | def delete_document(self, doc_id: str) -> bool: 489 | """ 490 | Delete a text document from the current collection. 491 | 492 | Args: 493 | doc_id: Document ID 494 | 495 | Returns: 496 | True if deleted, False if not found 497 | """ 498 | if self._current_collection is None: 499 | raise RuntimeError("No current collection. Use create_collection() or use_collection() first.") 500 | 501 | result = self._current_collection.delete_document(doc_id) 502 | if result: 503 | self._stats["operations_performed"] += 1 504 | self._update_total_stats() 505 | return result 506 | 507 | def clear_documents(self) -> None: 508 | """Clear all text documents from the current collection.""" 509 | if self._current_collection is None: 510 | raise RuntimeError("No current collection. Use create_collection() or use_collection() first.") 511 | 512 | self._current_collection.clear_documents() 513 | self._stats["operations_performed"] += 1 514 | self._update_total_stats() 515 | 516 | def change_embedding_model(self, model_name: str, **kwargs) -> None: 517 | """ 518 | Change the text embedding model for the current collection. 519 | 520 | Args: 521 | model_name: New sentence-transformers model name 522 | **kwargs: Additional arguments for TextEmbedder 523 | """ 524 | if self._current_collection is None: 525 | raise RuntimeError("No current collection. Use create_collection() or use_collection() first.") 526 | 527 | self._current_collection.change_embedding_model(model_name, **kwargs) 528 | # Update OctaneDB dimension if it changed 529 | if self._current_collection.dimension != self.dimension: 530 | self.dimension = self._current_collection.dimension 531 | logger.info(f"OctaneDB dimension updated to: {self.dimension}") 532 | 533 | def get_available_models(self) -> List[str]: 534 | """Get list of available sentence-transformers models.""" 535 | if not self._text_embedder: 536 | return [] 537 | return self._text_embedder.get_available_models() 538 | 539 | def get_stats(self) -> Dict[str, Any]: 540 | """Get comprehensive database statistics.""" 541 | stats = self._stats.copy() 542 | stats["collection_count"] = len(self._collections) 543 | stats["current_collection"] = self._current_collection.name if self._current_collection else None 544 | stats["text_embeddings_enabled"] = self._text_embedder is not None 545 | stats["embedding_model"] = self._text_embedder.model_name if self._text_embedder else None 546 | 547 | # Add collection-specific stats 548 | if self._current_collection: 549 | collection_stats = self._current_collection.get_stats() 550 | stats.update(collection_stats) 551 | 552 | return stats 553 | 554 | def get_collection_info(self, name: Optional[str] = None) -> Dict[str, Any]: 555 | """ 556 | Get information about a collection. 
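        Example (a minimal sketch, assuming a collection named "articles"
        already exists):

            info = db.get_collection_info("articles")
            print(info["num_vectors"], info["dimension"])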
557 | 
558 |         Args:
559 |             name: Collection name (uses current collection if None)
560 | 
561 |         Returns:
562 |             Collection information dictionary
563 |         """
564 |         if name is None:
565 |             if not self._current_collection:
566 |                 raise RuntimeError("No current collection")
567 |             return self._current_collection.get_collection_info()
568 | 
569 |         if name not in self._collections:
570 |             raise ValueError(f"Collection '{name}' does not exist")
571 | 
572 |         return self._collections[name].get_collection_info()
573 | 
574 |     def _update_total_stats(self) -> None:
575 |         """Update total statistics across all collections."""
576 |         total_vectors = 0
577 |         total_documents = 0
578 | 
579 |         for collection in self._collections.values():
580 |             collection_stats = collection.get_stats()
581 |             total_vectors += collection_stats.get("vector_count", 0)
582 |             total_documents += collection_stats.get("document_count", 0)
583 | 
584 |         self._stats["total_vectors"] = total_vectors
585 |         self._stats["total_documents"] = total_documents
586 | 
587 |     def save(self, path: Optional[Union[str, Path]] = None) -> None:
588 |         """
589 |         Save the database to persistent storage.
590 | 
591 |         Args:
592 |             path: Optional custom save path
593 |         """
594 |         if not self._storage_manager:
595 |             raise RuntimeError("Storage manager not initialized. Set storage_path during initialization.")
596 | 
597 |         save_path = Path(path) if path else self._storage_path
598 |         self._storage_manager.save_database(self._collections, save_path)
599 |         logger.info(f"Database saved to: {save_path}")
600 | 
601 |     @classmethod
602 |     def load(cls, path: str, **kwargs) -> "OctaneDB":
603 |         """
604 |         Load a database from persistent storage.
605 | 
606 |         Args:
607 |             path: Path to the saved database
608 |             **kwargs: Additional initialization parameters
609 | 
610 |         Returns:
611 |             Loaded OctaneDB instance
612 |         """
613 |         path = Path(path)
614 |         if not path.exists():
615 |             raise FileNotFoundError(f"Database file not found: {path}")
616 | 
617 |         storage_manager = StorageManager(storage_path=path.parent)
618 |         metadata = storage_manager.load_database_metadata(path)
619 | 
620 |         # Get dimension from first collection
621 |         collections_metadata = metadata.get("collections", {})
622 |         if not collections_metadata:
623 |             raise ValueError("No collections found in database")
624 | 
625 |         first_collection_name = list(collections_metadata.keys())[0]
626 |         first_collection_meta = collections_metadata[first_collection_name]
627 | 
628 |         # Create instance with loaded parameters
629 |         instance = cls(
630 |             dimension=first_collection_meta["dimension"],
631 |             index_type=first_collection_meta.get("index_type", "hnsw"),
632 |             m=first_collection_meta.get("m", 16),
633 |             ef_construction=first_collection_meta.get("ef_construction", 200),
634 |             ef_search=first_collection_meta.get("ef_search", 100),
635 |             max_elements=first_collection_meta.get("max_elements", 1000000),
636 |             distance_metric=first_collection_meta.get("distance_metric", "cosine"),
637 |             storage_path=path.parent,
638 |             **kwargs
639 |         )
640 | 
641 |         # Load collections and restore the current collection pointer
642 |         instance._collections = storage_manager.load_database(path)
643 |         instance._current_collection = next(iter(instance._collections.values()), None)
644 |         logger.info(f"Database loaded from: {path}")
645 |         return instance
646 | 
647 |     def __enter__(self):
648 |         """Context manager entry."""
649 |         return self
650 | 
651 |     def __exit__(self, exc_type, exc_val, exc_tb):
652 |         """Context manager exit."""
653 |         if self._storage_manager:
654 |             self.save()
655 | 
656 |     def __len__(self) -> int:
657 |         """Return total number of vectors across all collections."""
658 |         return sum(len(collection) for collection
in self._collections.values()) 659 | 660 | def __contains__(self, collection_name: str) -> bool: 661 | """Check if a collection exists.""" 662 | return collection_name in self._collections 663 | -------------------------------------------------------------------------------- /octanedb/collection.py: -------------------------------------------------------------------------------- 1 | """ 2 | Collection management for OctaneDB. 3 | Handles vector storage, indexing, and operations within a collection. 4 | """ 5 | 6 | import logging 7 | import time 8 | from typing import Dict, List, Optional, Union, Any, Tuple 9 | import numpy as np 10 | 11 | from .index import HNSWIndex 12 | from .embeddings import TextEmbedder, ChromaCompatibleEmbedder 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class Collection: 18 | """ 19 | A collection of vectors with metadata and indexing capabilities. 20 | 21 | Features: 22 | - Vector storage and retrieval 23 | - Metadata management 24 | - Automatic indexing (HNSW) 25 | - Text embedding generation 26 | - ChromaDB-compatible API 27 | """ 28 | 29 | def __init__( 30 | self, 31 | name: str, 32 | dimension: int, 33 | index_type: str = "hnsw", 34 | m: int = 16, 35 | ef_construction: int = 200, 36 | ef_search: int = 100, 37 | max_elements: int = 1000000, 38 | distance_metric: str = "cosine", 39 | storage_manager=None, 40 | query_engine=None, 41 | vector_utils=None, 42 | embedding_model=None, 43 | enable_text_embeddings: bool = True 44 | ): 45 | """ 46 | Initialize a collection. 47 | 48 | Args: 49 | name: Collection name 50 | dimension: Vector dimension 51 | index_type: Type of index to use 52 | m: HNSW connections per layer 53 | ef_construction: Construction search depth 54 | ef_search: Search depth 55 | max_elements: Maximum number of vectors 56 | distance_metric: Distance metric for similarity 57 | storage_manager: Storage manager instance 58 | query_engine: Query engine instance 59 | vector_utils: Vector utilities instance 60 | embedding_model: Sentence-transformers model name for text embeddings 61 | enable_text_embeddings: Whether to enable text embedding functionality 62 | """ 63 | self.name = name 64 | self.dimension = dimension 65 | self.index_type = index_type 66 | self.m = m 67 | self.ef_construction = ef_construction 68 | self.ef_search = ef_search 69 | self.max_elements = max_elements 70 | self.distance_metric = distance_metric 71 | 72 | # Initialize components 73 | self._storage_manager = storage_manager 74 | self._query_engine = query_engine 75 | self._vector_utils = vector_utils 76 | 77 | # Vector storage 78 | self._vectors: Dict[int, np.ndarray] = {} 79 | self._metadata: Dict[int, Dict[str, Any]] = {} 80 | self._next_id = 0 81 | 82 | # Text document storage 83 | self._documents: Dict[int, str] = {} 84 | self._text_embedder = None 85 | 86 | # Initialize text embeddings if enabled 87 | if enable_text_embeddings and embedding_model: 88 | try: 89 | self._text_embedder = TextEmbedder(embedding_model) 90 | # Update dimension if it doesn't match 91 | if self._text_embedder.dimension != dimension: 92 | logger.warning(f"Embedding model dimension ({self._text_embedder.dimension}) " 93 | f"doesn't match collection dimension ({dimension})") 94 | self.dimension = self._text_embedder.dimension 95 | except ImportError: 96 | logger.warning("Text embeddings disabled: sentence-transformers not available") 97 | self._text_embedder = None 98 | 99 | # Index management 100 | self._index: Optional[HNSWIndex] = None 101 | self._index_built = False 102 | 
self._index_needs_rebuild = False 103 | 104 | # Performance tracking 105 | self._stats = { 106 | "inserts": 0, 107 | "searches": 0, 108 | "updates": 0, 109 | "deletes": 0, 110 | "text_documents": 0, 111 | "index_builds": 0, 112 | "last_index_build": None 113 | } 114 | 115 | # Initialize index 116 | self._init_index() 117 | 118 | logger.info(f"Collection '{name}' initialized with dimension {dimension}") 119 | 120 | def _init_index(self) -> None: 121 | """Initialize the vector index.""" 122 | if self.index_type == "hnsw": 123 | self._index = HNSWIndex( 124 | dimension=self.dimension, 125 | m=self.m, 126 | ef_construction=self.ef_construction, 127 | ef_search=self.ef_search, 128 | max_elements=self.max_elements, 129 | distance_metric=self.distance_metric 130 | ) 131 | else: 132 | raise ValueError(f"Unsupported index type: {self.index_type}") 133 | 134 | def insert( 135 | self, 136 | vectors, 137 | metadata=None, 138 | ids=None 139 | ): 140 | """ 141 | Insert vectors into the collection. 142 | 143 | Args: 144 | vectors: Vector(s) to insert 145 | metadata: Optional metadata for each vector 146 | ids: Optional custom IDs 147 | 148 | Returns: 149 | Inserted vector ID(s) 150 | """ 151 | # Convert to numpy array if needed 152 | if isinstance(vectors, list): 153 | vectors = np.array(vectors, dtype=np.float32) 154 | elif not isinstance(vectors, np.ndarray): 155 | vectors = np.array([vectors], dtype=np.float32) 156 | 157 | # Ensure 2D array 158 | if vectors.ndim == 1: 159 | vectors = vectors.reshape(1, -1) 160 | 161 | # Validate dimensions 162 | if vectors.shape[1] != self.dimension: 163 | raise ValueError(f"Vector dimension {vectors.shape[1]} does not match collection dimension {self.dimension}") 164 | 165 | # Handle metadata 166 | if metadata is None: 167 | metadata = [{} for _ in range(len(vectors))] 168 | elif not isinstance(metadata, list): 169 | metadata = [metadata] 170 | 171 | # Handle IDs 172 | if ids is None: 173 | ids = [self._next_id + i for i in range(len(vectors))] 174 | elif not isinstance(ids, list): 175 | ids = [ids] 176 | 177 | # Validate lengths 178 | if len(vectors) != len(metadata) or len(vectors) != len(ids): 179 | raise ValueError("Vectors, metadata, and IDs must have the same length") 180 | 181 | # Insert vectors 182 | inserted_ids = [] 183 | for i, (vector, meta, vector_id) in enumerate(zip(vectors, metadata, ids)): 184 | # Check if ID already exists 185 | if vector_id in self._vectors: 186 | raise ValueError(f"Vector ID {vector_id} already exists") 187 | 188 | # Store vector and metadata 189 | self._vectors[vector_id] = vector.copy() 190 | self._metadata[vector_id] = meta.copy() 191 | inserted_ids.append(vector_id) 192 | 193 | # Update next_id (only for integer IDs) 194 | if isinstance(vector_id, int): 195 | self._next_id = max(self._next_id, vector_id + 1) 196 | 197 | # Mark index for rebuild 198 | self._index_needs_rebuild = True 199 | 200 | # Update stats 201 | self._stats["inserts"] += len(vectors) 202 | 203 | logger.debug(f"Inserted {len(vectors)} vectors into collection '{self.name}'") 204 | 205 | # Return single ID or list based on input 206 | return inserted_ids[0] if len(inserted_ids) == 1 else inserted_ids 207 | 208 | def add_text_documents( 209 | self, 210 | documents, 211 | ids=None, 212 | metadatas=None, 213 | batch_size: int = 32, 214 | show_progress_bar: bool = False 215 | ): 216 | """ 217 | Add text documents with automatic embedding generation (ChromaDB-compatible). 
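        Example (a minimal sketch; the model name and document texts are
        illustrative):

            collection = Collection("docs", dimension=384, embedding_model="all-MiniLM-L6-v2")
            result = collection.add_text_documents(
                ["first document", "second document"],
                metadatas=[{"source": "a"}, {"source": "b"}],
            )
            print(result["ids"])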
218 | 219 | Args: 220 | documents: Text document(s) to add 221 | ids: Document IDs (auto-generated if not provided) 222 | metadatas: Optional metadata for documents 223 | batch_size: Batch size for embedding generation 224 | show_progress_bar: Whether to show progress bar 225 | 226 | Returns: 227 | Dictionary with 'ids', 'embeddings', and 'documents' keys 228 | """ 229 | if not self._text_embedder: 230 | raise RuntimeError("Text embeddings not enabled. Set embedding_model during collection creation.") 231 | 232 | # Ensure documents is a list 233 | if isinstance(documents, str): 234 | documents = [documents] 235 | 236 | if not documents: 237 | raise ValueError("No documents provided") 238 | 239 | num_docs = len(documents) 240 | 241 | # Handle IDs 242 | if ids is None: 243 | # Auto-generate string IDs 244 | ids = [f"doc_{self._next_id + i}" for i in range(num_docs)] 245 | elif isinstance(ids, str): 246 | ids = [ids] 247 | 248 | if len(ids) != num_docs: 249 | raise ValueError("Number of IDs must match number of documents") 250 | 251 | # Handle metadata 252 | if metadatas is None: 253 | metadatas = [{} for _ in range(num_docs)] 254 | elif isinstance(metadatas, dict): 255 | metadatas = [metadatas for _ in range(num_docs)] 256 | 257 | if len(metadatas) != num_docs: 258 | raise ValueError("Number of metadatas must match number of documents") 259 | 260 | # Generate embeddings 261 | logger.info(f"Generating embeddings for {num_docs} documents...") 262 | embeddings = self._text_embedder.embed_texts( 263 | documents, 264 | batch_size=batch_size, 265 | show_progress_bar=show_progress_bar 266 | ) 267 | 268 | # Store documents 269 | for doc_id, doc_text in zip(ids, documents): 270 | self._documents[doc_id] = doc_text 271 | 272 | # Insert vectors into collection 273 | vector_ids = self.insert( 274 | vectors=embeddings, 275 | metadata=metadatas, 276 | ids=ids 277 | ) 278 | 279 | # Update stats 280 | self._stats["text_documents"] += num_docs 281 | 282 | # Prepare result 283 | result = { 284 | "ids": ids, 285 | "embeddings": embeddings, 286 | "documents": documents, 287 | "metadatas": metadatas, 288 | "vector_ids": vector_ids 289 | } 290 | 291 | logger.info(f"Added {num_docs} text documents with IDs: {ids}") 292 | return result 293 | 294 | def add( 295 | self, 296 | ids=None, 297 | documents=None, 298 | metadatas=None, 299 | embeddings=None 300 | ): 301 | """ 302 | ChromaDB-compatible add method for text documents. 
303 | 304 | Args: 305 | ids: Document IDs (auto-generated if not provided) 306 | documents: Text documents to add 307 | metadatas: Optional metadata for documents 308 | embeddings: Pre-computed embeddings (optional) 309 | 310 | Returns: 311 | Dictionary with 'ids' and 'embeddings' keys 312 | """ 313 | if documents is not None: 314 | # Use text document processing 315 | return self.add_text_documents(documents, ids, metadatas) 316 | elif embeddings is not None: 317 | # Use pre-computed embeddings 318 | if isinstance(embeddings, list) and isinstance(embeddings[0], (int, float)): 319 | embeddings = [embeddings] 320 | embeddings = np.array(embeddings, dtype=np.float32) 321 | 322 | # Handle IDs 323 | if ids is None: 324 | ids = [f"vec_{self._next_id + i}" for i in range(len(embeddings))] 325 | elif isinstance(ids, str): 326 | ids = [ids] 327 | 328 | # Handle metadata 329 | if metadatas is None: 330 | metadatas = [{} for _ in range(len(embeddings))] 331 | elif isinstance(metadatas, dict): 332 | metadatas = [metadatas for _ in range(len(embeddings))] 333 | 334 | # Insert vectors 335 | vector_ids = self.insert(vectors=embeddings, metadata=metadatas, ids=ids) 336 | 337 | return { 338 | "ids": ids, 339 | "embeddings": embeddings, 340 | "vector_ids": vector_ids 341 | } 342 | else: 343 | raise ValueError("Either 'documents' or 'embeddings' must be provided") 344 | 345 | def get_document(self, doc_id: str): 346 | """Get a text document by ID.""" 347 | return self._documents.get(doc_id) 348 | 349 | def list_documents(self): 350 | """List all text document IDs.""" 351 | return list(self._documents.keys()) 352 | 353 | def delete_document(self, doc_id: str) -> bool: 354 | """Delete a text document by ID.""" 355 | if doc_id in self._documents: 356 | del self._documents[doc_id] 357 | # Also delete the corresponding vector if it exists 358 | if doc_id in self._vectors: 359 | del self._vectors[doc_id] 360 | if doc_id in self._metadata: 361 | del self._metadata[doc_id] 362 | logger.info(f"Deleted document: {doc_id}") 363 | return True 364 | return False 365 | 366 | def clear_documents(self): 367 | """Clear all text documents and vectors.""" 368 | self._documents.clear() 369 | self._vectors.clear() 370 | self._metadata.clear() 371 | self._next_id = 0 372 | self._index_built = False 373 | self._index_needs_rebuild = False 374 | logger.info("Cleared all documents and vectors") 375 | 376 | def get_collection_info(self): 377 | """Get comprehensive information about the collection.""" 378 | return { 379 | "name": self.name, 380 | "dimension": self.dimension, 381 | "num_vectors": len(self._vectors), 382 | "num_documents": len(self._documents), 383 | "index_type": self.index_type, 384 | "distance_metric": self.distance_metric, 385 | "index_built": self._index_built, 386 | "text_embeddings_enabled": self._text_embedder is not None, 387 | "embedding_model": self._text_embedder.model_name if self._text_embedder else None, 388 | "stats": self._stats.copy() 389 | } 390 | 391 | def change_embedding_model(self, model_name: str, **kwargs): 392 | """ 393 | Change the text embedding model. 
394 | 395 | Args: 396 | model_name: New sentence-transformers model name 397 | **kwargs: Additional arguments for TextEmbedder 398 | """ 399 | if not self._text_embedder: 400 | raise RuntimeError("Text embeddings not enabled") 401 | 402 | old_dimension = self.dimension 403 | self._text_embedder.change_model(model_name, **kwargs) 404 | 405 | # Update collection dimension if it changed 406 | if self._text_embedder.dimension != old_dimension: 407 | logger.warning(f"Embedding model dimension changed from {old_dimension} to {self._text_embedder.dimension}") 408 | self.dimension = self._text_embedder.dimension 409 | # Rebuild index with new dimension 410 | self._index_needs_rebuild = True 411 | 412 | logger.info(f"Embedding model changed to: {model_name}") 413 | 414 | def search( 415 | self, 416 | query_vector, 417 | k: int = 10, 418 | filter=None, 419 | include_metadata: bool = False 420 | ): 421 | """ 422 | Search for similar vectors. 423 | 424 | Args: 425 | query_vector: Query vector 426 | k: Number of results to return 427 | filter: Optional metadata filter 428 | include_metadata: Whether to include metadata in results 429 | 430 | Returns: 431 | List of (id, distance, metadata) tuples 432 | """ 433 | # Ensure index is built 434 | if not self._index_built or self._index_needs_rebuild: 435 | self._build_index() 436 | 437 | # Validate query vector 438 | if query_vector.shape[0] != self.dimension: 439 | raise ValueError(f"Query vector dimension {query_vector.shape[0]} does not match collection dimension {self.dimension}") 440 | 441 | # Search using index 442 | start_time = time.time() 443 | results = self._index.search(query_vector, k) 444 | search_time = time.time() - start_time 445 | 446 | # Apply filters if specified 447 | if filter: 448 | results = self._apply_filter(results, filter) 449 | 450 | # Format results 451 | formatted_results = [] 452 | for vector_id, distance in results: 453 | metadata = self._metadata.get(vector_id) if include_metadata else None 454 | formatted_results.append((vector_id, distance, metadata)) 455 | 456 | # Update stats 457 | self._stats["searches"] += 1 458 | 459 | logger.debug(f"Search completed in {search_time:.4f}s, found {len(formatted_results)} results") 460 | 461 | return formatted_results 462 | 463 | def search_text( 464 | self, 465 | query_text: str, 466 | k: int = 10, 467 | filter=None, 468 | include_metadata: bool = False 469 | ): 470 | """ 471 | Search for similar documents using text query. 472 | 473 | Args: 474 | query_text: Text query to search for 475 | k: Number of results to return 476 | filter: Optional metadata filter 477 | include_metadata: Whether to include metadata in results 478 | 479 | Returns: 480 | List of (id, distance, metadata) tuples 481 | """ 482 | if not self._text_embedder: 483 | raise RuntimeError("Text embeddings not enabled") 484 | 485 | # Generate embedding for query text 486 | query_embedding = self._text_embedder.embed_texts(query_text)[0] 487 | 488 | # Search using the embedding 489 | return self.search( 490 | query_vector=query_embedding, 491 | k=k, 492 | filter=filter, 493 | include_metadata=include_metadata 494 | ) 495 | 496 | def search_batch( 497 | self, 498 | query_vectors, 499 | k: int = 10, 500 | filter=None, 501 | include_metadata: bool = False 502 | ): 503 | """ 504 | Batch search for similar vectors. 
505 | 506 | Args: 507 | query_vectors: Query vectors 508 | k: Number of results per query 509 | filter: Optional metadata filter 510 | include_metadata: Whether to include metadata in results 511 | 512 | Returns: 513 | List of result lists for each query 514 | """ 515 | # Ensure index is built 516 | if not self._index_built or self._index_needs_rebuild: 517 | self._build_index() 518 | 519 | # Validate query vectors 520 | if query_vectors.ndim == 1: 521 | query_vectors = query_vectors.reshape(1, -1) 522 | 523 | if query_vectors.shape[1] != self.dimension: 524 | raise ValueError(f"Query vector dimension {query_vectors.shape[1]} does not match collection dimension {self.dimension}") 525 | 526 | # Batch search using index 527 | start_time = time.time() 528 | batch_results = self._index.search_batch(query_vectors, k) 529 | search_time = time.time() - start_time 530 | 531 | # Apply filters and format results 532 | formatted_batch_results = [] 533 | for results in batch_results: 534 | if filter: 535 | results = self._apply_filter(results, filter) 536 | 537 | formatted_results = [] 538 | for vector_id, distance in results: 539 | metadata = self._metadata.get(vector_id) if include_metadata else None 540 | formatted_results.append((vector_id, distance, metadata)) 541 | 542 | formatted_batch_results.append(formatted_results) 543 | 544 | # Update stats 545 | self._stats["searches"] += 1 546 | 547 | logger.debug(f"Batch search completed in {search_time:.4f}s for {len(query_vectors)} queries") 548 | 549 | return formatted_batch_results 550 | 551 | def search_text_batch( 552 | self, 553 | query_texts, 554 | k: int = 10, 555 | filter=None, 556 | include_metadata: bool = False, 557 | batch_size: int = 32 558 | ): 559 | """ 560 | Batch search for similar documents using text queries. 561 | 562 | Args: 563 | query_texts: List of text queries 564 | k: Number of results per query 565 | filter: Optional metadata filter 566 | include_metadata: Whether to include metadata in results 567 | batch_size: Batch size for embedding generation 568 | 569 | Returns: 570 | List of result lists for each query 571 | """ 572 | if not self._text_embedder: 573 | raise RuntimeError("Text embeddings not enabled") 574 | 575 | # Generate embeddings for all query texts 576 | query_embeddings = self._text_embedder.embed_texts( 577 | query_texts, 578 | batch_size=batch_size 579 | ) 580 | 581 | # Search using the embeddings 582 | return self.search_batch( 583 | query_vectors=query_embeddings, 584 | k=k, 585 | filter=filter, 586 | include_metadata=include_metadata 587 | ) 588 | 589 | def _apply_filter(self, results, filter): 590 | """ 591 | Apply metadata filter to search results. 
592 | 593 | Args: 594 | results: Search results (id, distance) tuples 595 | filter: Filter criteria 596 | 597 | Returns: 598 | Filtered results 599 | """ 600 | if not self._query_engine: 601 | logger.warning("Query engine not available, returning unfiltered results") 602 | return results 603 | 604 | # Apply filter using query engine 605 | filtered_results = [] 606 | for vector_id, distance in results: 607 | metadata = self._metadata.get(vector_id, {}) 608 | if self._query_engine.evaluate(metadata, filter): 609 | filtered_results.append((vector_id, distance)) 610 | 611 | return filtered_results 612 | 613 | def _build_index(self) -> None: 614 | """Build or rebuild the vector index.""" 615 | if not self._vectors: 616 | logger.warning("No vectors to index") 617 | return 618 | 619 | start_time = time.time() 620 | 621 | # Convert vectors to array 622 | vector_ids = list(self._vectors.keys()) 623 | vectors_array = np.array([self._vectors[vid] for vid in vector_ids], dtype=np.float32) 624 | 625 | # Build index 626 | self._index.build(vectors_array, vector_ids) 627 | 628 | # Update status 629 | self._index_built = True 630 | self._index_needs_rebuild = False 631 | 632 | build_time = time.time() - start_time 633 | 634 | # Update stats 635 | self._stats["index_builds"] += 1 636 | self._stats["last_index_build"] = time.time() 637 | 638 | logger.info(f"Index built for {len(vectors_array)} vectors in {build_time:.4f}s") 639 | 640 | def update(self, id: int, vector, metadata=None) -> None: 641 | """ 642 | Update a vector. 643 | 644 | Args: 645 | id: Vector ID to update 646 | vector: New vector 647 | metadata: New metadata 648 | """ 649 | if id not in self._vectors: 650 | raise ValueError(f"Vector ID {id} does not exist") 651 | 652 | # Validate vector dimension 653 | if vector.shape[0] != self.dimension: 654 | raise ValueError(f"Vector dimension {vector.shape[0]} does not match collection dimension {self.dimension}") 655 | 656 | # Update vector and metadata 657 | self._vectors[id] = vector.copy() 658 | if metadata is not None: 659 | self._metadata[id] = metadata.copy() 660 | 661 | # Mark index for rebuild 662 | self._index_needs_rebuild = True 663 | 664 | # Update stats 665 | self._stats["updates"] += 1 666 | 667 | logger.debug(f"Updated vector {id} in collection '{self.name}'") 668 | 669 | def delete(self, id: int) -> None: 670 | """ 671 | Delete a vector. 672 | 673 | Args: 674 | id: Vector ID to delete 675 | """ 676 | if id not in self._vectors: 677 | raise ValueError(f"Vector ID {id} does not exist") 678 | 679 | # Remove vector and metadata 680 | del self._vectors[id] 681 | if id in self._metadata: 682 | del self._metadata[id] 683 | 684 | # Mark index for rebuild 685 | self._index_needs_rebuild = True 686 | 687 | # Update stats 688 | self._stats["deletes"] += 1 689 | 690 | logger.debug(f"Deleted vector {id} from collection '{self.name}'") 691 | 692 | def delete_batch(self, ids) -> None: 693 | """ 694 | Batch delete vectors. 695 | 696 | Args: 697 | ids: List of vector IDs to delete 698 | """ 699 | for vector_id in ids: 700 | self.delete(vector_id) 701 | 702 | def get_vector(self, id: int, include_metadata: bool = False): 703 | """ 704 | Get a vector by ID. 
705 | 706 | Args: 707 | id: Vector ID 708 | include_metadata: Whether to include metadata 709 | 710 | Returns: 711 | Vector or (vector, metadata) tuple 712 | """ 713 | if id not in self._vectors: 714 | raise ValueError(f"Vector ID {id} does not exist") 715 | 716 | vector = self._vectors[id] 717 | if include_metadata: 718 | metadata = self._metadata.get(id) 719 | return vector, metadata 720 | else: 721 | return vector 722 | 723 | def count(self) -> int: 724 | """Get total number of vectors in the collection.""" 725 | return len(self._vectors) 726 | 727 | def get_stats(self): 728 | """Get collection statistics.""" 729 | stats = self._stats.copy() 730 | stats["vector_count"] = len(self._vectors) 731 | stats["metadata_count"] = len(self._metadata) 732 | stats["document_count"] = len(self._documents) 733 | stats["index_built"] = self._index_built 734 | stats["index_needs_rebuild"] = self._index_needs_rebuild 735 | return stats 736 | 737 | def optimize_index(self) -> None: 738 | """Optimize the collection's index.""" 739 | if not self._index_built: 740 | logger.warning("Index not built yet") 741 | return 742 | 743 | start_time = time.time() 744 | self._index.optimize() 745 | optimize_time = time.time() - start_time 746 | 747 | logger.info(f"Index optimization completed in {optimize_time:.4f}s") 748 | 749 | def clear(self) -> None: 750 | """Clear all vectors from the collection.""" 751 | self._vectors.clear() 752 | self._metadata.clear() 753 | self._documents.clear() 754 | self._next_id = 0 755 | self._index_built = False 756 | self._index_needs_rebuild = False 757 | 758 | logger.info(f"Collection '{self.name}' cleared") 759 | 760 | def __len__(self) -> int: 761 | """Return number of vectors in the collection.""" 762 | return len(self._vectors) 763 | 764 | def __contains__(self, id: int) -> bool: 765 | """Check if a vector ID exists in the collection.""" 766 | return id in self._vectors 767 | --------------------------------------------------------------------------------
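A minimal usage sketch of the Collection API defined in octanedb/collection.py above, assuming sentence-transformers and the all-MiniLM-L6-v2 model are installed; the collection name, documents, and metadata values are illustrative only and do not appear in the repository.

    # Hypothetical example (not part of the package source): exercises the
    # text-document path of the Collection class shown above.
    from octanedb.collection import Collection

    # Create a text-enabled collection. If the embedding model's output size
    # differs from `dimension`, the collection adopts the model's dimension.
    col = Collection(
        name="articles",
        dimension=384,
        embedding_model="all-MiniLM-L6-v2",  # assumes sentence-transformers is available
    )

    # Add text documents; IDs are auto-generated and embeddings are computed.
    col.add(
        documents=["Vectors encode meaning.", "HNSW keeps search fast."],
        metadatas=[{"topic": "ml"}, {"topic": "search"}],
    )

    # Query by text; the HNSW index is built lazily on the first search.
    for vec_id, distance, meta in col.search_text("semantic search", k=2, include_metadata=True):
        print(vec_id, distance, meta)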