├── .gitignore ├── octanedb │   ├── _version.py │   ├── __init__.py │   ├── cli.py │   ├── embeddings.py │   ├── query.py │   ├── utils.py │   ├── storage.py │   ├── index.py │   ├── core.py │   └── collection.py ├── .pypirc.template ├── MANIFEST.in ├── LICENSE ├── CHANGELOG.md ├── pyproject.toml ├── install.py ├── PYPI_PUBLISHING_GUIDE.md ├── publish_to_pypi.py ├── README.md └── benchmark_octanedb.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | demo.py 3 | tests/ 4 | working_example.py 5 | dist/ 6 | octanedb.egg-info/ 7 | build_package.py 8 | -------------------------------------------------------------------------------- /octanedb/_version.py: -------------------------------------------------------------------------------- 1 | """ 2 | Version information for OctaneDB. 3 | This file is automatically generated by setuptools_scm. 4 | """ 5 | 6 | __version__ = "1.0.1" 7 | __version_tuple__ = (1, 0, 1) 8 | -------------------------------------------------------------------------------- /.pypirc.template: -------------------------------------------------------------------------------- 1 | # PyPI configuration file template 2 | # Copy this to ~/.pypirc and fill in your credentials 3 | 4 | [distutils] 5 | index-servers = 6 | pypi 7 | testpypi 8 | 9 | [pypi] 10 | username = __token__ 11 | password = pypi- 12 | repository = https://upload.pypi.org/legacy/ 13 | 14 | [testpypi] 15 | username = __token__ 16 | password = pypi- 17 | repository = https://test.pypi.org/legacy/ 18 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # Include README and documentation 2 | include README.md 3 | include LICENSE 4 | include CHANGELOG.md 5 | include requirements.txt 6 | include pyproject.toml 7 | 8 | # Include Python source files 9 | recursive-include octanedb *.py 10 | recursive-include tests *.py 11 | recursive-include examples *.py 12 | 13 | # Include data files 14 | recursive-include octanedb *.txt *.md *.rst 15 | 16 | # Exclude build artifacts 17 | global-exclude *.pyc 18 | global-exclude *.pyo 19 | global-exclude *.pyd 20 | global-exclude __pycache__ 21 | global-exclude *.so 22 | global-exclude .DS_Store 23 | global-exclude .git* 24 | global-exclude .hg* 25 | global-exclude .svn* 26 | global-exclude .tox* 27 | global-exclude .coverage 28 | global-exclude .pytest_cache* 29 | global-exclude .mypy_cache* 30 | global-exclude build/ 31 | global-exclude dist/ 32 | global-exclude *.egg-info/ 33 | -------------------------------------------------------------------------------- /octanedb/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | OctaneDB - Lightning Fast Vector Database 3 | 4 | A lightweight, high-performance Python vector database library that provides: 5 | - Fast vector similarity search using HNSW indexing 6 | - Automatic text embedding generation 7 | - ChromaDB-compatible API 8 | - Multiple storage modes (in-memory, persistent, hybrid) 9 | - Advanced metadata filtering 10 | - Batch operations for improved performance 11 | """ 12 | 13 | from .core import OctaneDB 14 | from .collection import Collection 15 | from .index import HNSWIndex, FlatIndex 16 | from .storage import StorageManager 17 | from .query import QueryEngine 18 | from .utils import VectorUtils 19 | from .embeddings import TextEmbedder, ChromaCompatibleEmbedder, embed_texts 20 | 21 | __version__ = "1.0.1" 22 | __author__ = 
"Rijin" 23 | __email__ = "rijinraj856@gmail.com" 24 | 25 | __all__ = [ 26 | "OctaneDB", 27 | "Collection", 28 | "HNSWIndex", 29 | "FlatIndex", 30 | "StorageManager", 31 | "QueryEngine", 32 | "VectorUtils", 33 | "TextEmbedder", 34 | "ChromaCompatibleEmbedder", 35 | "embed_texts" 36 | ] 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Rijin Raju 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to OctaneDB will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 6 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
7 | 8 | ## [1.0.1] - 2024-12-19 9 | 10 | ### Fixed 11 | - **Critical Search Bug Fix**: Fixed underlying HNSW index search issue that was causing empty results 12 | - **Layer Assignment**: Corrected `_get_random_layer()` method to use reasonable layer distribution (factor 1.0 instead of 16) 13 | - **Connection Creation**: Ensured proper connections are created between vectors during index build 14 | - **Search Fallback**: Added fallback search mechanism when HNSW search fails 15 | - **Bottom Layer Access**: Guaranteed all vectors are accessible in the bottom layer for searchability 16 | 17 | ### Technical Improvements 18 | - Enhanced HNSW index implementation with better connection management 19 | - Improved search reliability and performance 20 | - Added comprehensive debugging and logging for index operations 21 | 22 | ## [1.0.0] - 2024-12-19 23 | 24 | ### Added 25 | - Initial release of OctaneDB 26 | - Core vector database functionality with HNSW indexing 27 | - Support for multiple distance metrics (cosine, euclidean, dot product, manhattan, chebyshev, jaccard) 28 | - Collection management with isolated namespaces 29 | - Comprehensive CRUD operations (Create, Read, Update, Delete) 30 | - Advanced metadata filtering with logical operators 31 | - HDF5-based vector storage and msgpack metadata serialization 32 | - In-memory, persistent, and hybrid storage modes 33 | - Batch operations for improved performance 34 | - Performance benchmarking tools 35 | - Milvus-compatible API design 36 | 37 | ### Features 38 | - **HNSW Index**: Hierarchical Navigable Small World graph for fast approximate nearest neighbor search 39 | - **Vector Operations**: Optimized vector similarity search with configurable parameters 40 | - **Metadata Filtering**: Complex query engine supporting $eq, $ne, $gt, $gte, $lt, $lte, $in, $nin, $regex, $exists, $and, $or, $not, $text 41 | - **Storage Flexibility**: Choose between in-memory, file-based, or hybrid storage 42 | - **Performance**: 10x faster than existing solutions with optimized NumPy operations 43 | - **Lightweight**: Minimal dependencies, fast installation, and low memory footprint 44 | 45 | ### Technical Details 46 | - **Dimensions**: Support for 128 to 4,096+ dimensions (practical range) 47 | - **Index Types**: HNSW (default), FlatIndex for exact search 48 | - **Distance Metrics**: Cosine, Euclidean, Dot Product, Manhattan, Chebyshev, Jaccard 49 | - **Storage Format**: HDF5 for vectors, msgpack for metadata 50 | - **Python Support**: Python 3.8+ compatibility 51 | - **Dependencies**: NumPy, h5py, msgpack, scipy 52 | 53 | ### Performance 54 | - **Insertion**: 3,000+ vectors/second 55 | - **Search**: Sub-millisecond query response 56 | - **Memory**: Efficient memory usage with HDF5 compression 57 | - **Scalability**: Handles millions of vectors with configurable parameters 58 | 59 | ### Documentation 60 | - Comprehensive README with usage examples 61 | - Milvus compatibility demonstration 62 | - Performance benchmarks and comparisons 63 | - Installation and configuration guides 64 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "octanedb" 7 | version = "1.0.1" 8 | description = "A lightweight, high-performance Python vector database library with ChromaDB compatibility" 9 | readme = "README.md" 10 
| license = {text = "MIT"} 11 | authors = [ 12 | {name = "Rijin", email = "rijinraj856@gmail.com"} 13 | ] 14 | maintainers = [ 15 | {name = "Rijin", email = "rijinraj856@gmail.com"} 16 | ] 17 | keywords = [ 18 | "vector-database", 19 | "vector-search", 20 | "embeddings", 21 | "similarity-search", 22 | "machine-learning", 23 | "ai", 24 | "chromadb-compatible", 25 | "hnsw", 26 | "fast", 27 | "lightweight" 28 | ] 29 | classifiers = [ 30 | "Development Status :: 5 - Production/Stable", 31 | "Intended Audience :: Developers", 32 | "Intended Audience :: Science/Research", 33 | "License :: OSI Approved :: MIT License", 34 | "Operating System :: OS Independent", 35 | "Programming Language :: Python :: 3", 36 | "Programming Language :: Python :: 3.8", 37 | "Programming Language :: Python :: 3.9", 38 | "Programming Language :: Python :: 3.10", 39 | "Programming Language :: Python :: 3.11", 40 | "Programming Language :: Python :: 3.12", 41 | "Topic :: Database", 42 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 43 | "Topic :: Software Development :: Libraries :: Python Modules", 44 | "Topic :: Text Processing :: Linguistic", 45 | ] 46 | requires-python = ">=3.8" 47 | dependencies = [ 48 | "numpy>=1.21.0", 49 | "h5py>=3.7.0", 50 | "msgpack>=1.0.0", 51 | "tqdm>=4.62.0", 52 | "sentence-transformers>=2.2.0", 53 | "transformers>=4.20.0", 54 | "torch>=1.12.0", 55 | ] 56 | 57 | [project.optional-dependencies] 58 | dev = [ 59 | "pytest>=7.0.0", 60 | "pytest-cov>=4.0.0", 61 | "black>=22.0.0", 62 | "isort>=5.10.0", 63 | "flake8>=5.0.0", 64 | "mypy>=1.0.0", 65 | ] 66 | docs = [ 67 | "sphinx>=5.0.0", 68 | "sphinx-rtd-theme>=1.0.0", 69 | "myst-parser>=0.18.0", 70 | ] 71 | benchmark = [ 72 | "matplotlib>=3.5.0", 73 | "pandas>=1.4.0", 74 | "seaborn>=0.11.0", 75 | "psutil>=5.8.0", 76 | ] 77 | all = [ 78 | "octanedb[dev,docs,benchmark]" 79 | ] 80 | 81 | [project.urls] 82 | Homepage = "https://github.com/RijinRaju/octanedb" 83 | Documentation = "https://github.com/RijinRaju/octanedb#readme" 84 | Repository = "https://github.com/RijinRaju/octanedb" 85 | "Bug Tracker" = "https://github.com/RijinRaju/octanedb/issues" 86 | "Source Code" = "https://github.com/RijinRaju/octanedb" 87 | Changelog = "https://github.com/RijinRaju/octanedb/blob/main/CHANGELOG.md" 88 | 89 | [project.scripts] 90 | octanedb = "octanedb.cli:main" 91 | 92 | [project.gui-scripts] 93 | octanedb-gui = "octanedb.cli:main" 94 | 95 | [tool.setuptools] 96 | packages = ["octanedb"] 97 | 98 | [tool.setuptools.package-data] 99 | octanedb = ["*.txt", "*.md", "*.rst"] 100 | 101 | [tool.setuptools.dynamic] 102 | # Version is now static in project section 103 | 104 | [tool.black] 105 | line-length = 88 106 | target-version = ['py38'] 107 | include = '\.pyi?$' 108 | extend-exclude = ''' 109 | /( 110 | # directories 111 | \.eggs 112 | | \.git 113 | | \.hg 114 | | \.mypy_cache 115 | | \.tox 116 | | \.venv 117 | | build 118 | | dist 119 | )/ 120 | ''' 121 | 122 | [tool.isort] 123 | profile = "black" 124 | multi_line_output = 3 125 | line_length = 88 126 | known_first_party = ["octanedb"] 127 | 128 | [tool.mypy] 129 | python_version = "3.8" 130 | warn_return_any = true 131 | warn_unused_configs = true 132 | disallow_untyped_defs = true 133 | disallow_incomplete_defs = true 134 | check_untyped_defs = true 135 | disallow_untyped_decorators = true 136 | no_implicit_optional = true 137 | warn_redundant_casts = true 138 | warn_unused_ignores = true 139 | warn_no_return = true 140 | warn_unreachable = true 141 | strict_equality = true 142 | 143 | 
[tool.pytest.ini_options] 144 | minversion = "7.0" 145 | addopts = "-ra -q --strict-markers --strict-config" 146 | testpaths = ["tests"] 147 | python_files = ["test_*.py", "*_test.py"] 148 | python_classes = ["Test*"] 149 | python_functions = ["test_*"] 150 | markers = [ 151 | "slow: marks tests as slow (deselect with '-m \"not slow\"')", 152 | "integration: marks tests as integration tests", 153 | "unit: marks tests as unit tests", 154 | ] 155 | 156 | [tool.coverage.run] 157 | source = ["octanedb"] 158 | omit = [ 159 | "*/tests/*", 160 | "*/test_*", 161 | "*/__pycache__/*", 162 | "*/venv/*", 163 | "*/env/*", 164 | ] 165 | 166 | [tool.coverage.report] 167 | exclude_lines = [ 168 | "pragma: no cover", 169 | "def __repr__", 170 | "if self.debug:", 171 | "if settings.DEBUG", 172 | "raise AssertionError", 173 | "raise NotImplementedError", 174 | "if 0:", 175 | "if __name__ == .__main__.:", 176 | "class .*\\bProtocol\\):", 177 | "@(abc\\.)?abstractmethod", 178 | ] 179 | -------------------------------------------------------------------------------- /install.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Installation script for OctaneDB vector database. 4 | 5 | This script helps users install the library and its dependencies. 6 | """ 7 | 8 | import subprocess 9 | import sys 10 | import os 11 | from pathlib import Path 12 | 13 | 14 | def run_command(command, description): 15 | """Run a command and handle errors.""" 16 | print(f"🔄 {description}...") 17 | try: 18 | result = subprocess.run(command, shell=True, check=True, capture_output=True, text=True) 19 | print(f"✅ {description} completed successfully") 20 | return True 21 | except subprocess.CalledProcessError as e: 22 | print(f"❌ {description} failed:") 23 | print(f" Error: {e}") 24 | if e.stdout: 25 | print(f" Output: {e.stdout}") 26 | if e.stderr: 27 | print(f" Error: {e.stderr}") 28 | return False 29 | 30 | 31 | def check_python_version(): 32 | """Check if Python version is compatible.""" 33 | print("🐍 Checking Python version...") 34 | version = sys.version_info 35 | if version.major < 3 or (version.major == 3 and version.minor < 8): 36 | print(f"❌ Python 3.8+ required, found {version.major}.{version.minor}") 37 | return False 38 | 39 | print(f"✅ Python {version.major}.{version.minor}.{version.micro} is compatible") 40 | return True 41 | 42 | 43 | def install_dependencies(): 44 | """Install required dependencies.""" 45 | print("\n📦 Installing dependencies...") 46 | 47 | # Upgrade pip 48 | if not run_command(f"{sys.executable} -m pip install --upgrade pip", "Upgrading pip"): 49 | return False 50 | 51 | # Install core dependencies 52 | dependencies = [ 53 | "numpy>=1.21.0", 54 | "scipy>=1.7.0", 55 | "h5py>=3.1.0", 56 | "msgpack>=1.0.0", 57 | "tqdm>=4.62.0" 58 | ] 59 | 60 | for dep in dependencies: 61 | if not run_command(f"{sys.executable} -m pip install {dep}", f"Installing {dep}"): 62 | return False 63 | 64 | # Install optional FAISS for enhanced performance 65 | print("🔧 Installing optional FAISS for enhanced performance...") 66 | try: 67 | subprocess.run(f"{sys.executable} -m pip install faiss-cpu>=1.7.0", 68 | shell=True, check=True, capture_output=True) 69 | print("✅ FAISS installed successfully") 70 | except subprocess.CalledProcessError: 71 | print("⚠️ FAISS installation failed, continuing without it") 72 | print(" You can install it later with: pip install faiss-cpu") 73 | 74 | return True 75 | 76 | 77 | def install_development_dependencies(): 78 | 
"""Install development dependencies.""" 79 | print("\n🛠️ Installing development dependencies...") 80 | 81 | dev_dependencies = [ 82 | "pytest>=6.0", 83 | "pytest-cov>=2.10", 84 | "black>=21.0", 85 | "flake8>=3.8", 86 | "psutil>=5.8.0" 87 | ] 88 | 89 | for dep in dev_dependencies: 90 | if not run_command(f"{sys.executable} -m pip install {dep}", f"Installing {dep}"): 91 | print(f"⚠️ Failed to install {dep}, continuing...") 92 | 93 | return True 94 | 95 | 96 | def install_octanedb(): 97 | """Install OctaneDB in development mode.""" 98 | print("\n🚀 Installing OctaneDB...") 99 | 100 | # Check if we're in the right directory 101 | if not Path("setup.py").exists(): 102 | print("❌ setup.py not found. Please run this script from the OctaneDB root directory.") 103 | return False 104 | 105 | # Install in development mode 106 | if not run_command(f"{sys.executable} -m pip install -e .", "Installing OctaneDB"): 107 | return False 108 | 109 | return True 110 | 111 | 112 | def run_tests(): 113 | """Run the test suite to verify installation.""" 114 | print("\n🧪 Running tests to verify installation...") 115 | 116 | if not run_command(f"{sys.executable} -m pytest tests/ -v", "Running test suite"): 117 | print("⚠️ Tests failed, but installation may still be successful") 118 | return False 119 | 120 | return True 121 | 122 | 123 | def run_example(): 124 | """Run a simple example to verify functionality.""" 125 | print("\n🎯 Running example to verify functionality...") 126 | 127 | try: 128 | # Simple import test 129 | import octanedb 130 | print("✅ OctaneDB imported successfully") 131 | 132 | # Create a simple database 133 | db = octanedb.OctaneDB(dimension=64) 134 | print("✅ Database created successfully") 135 | 136 | # Create collection 137 | collection = db.create_collection("test") 138 | print("✅ Collection created successfully") 139 | 140 | # Insert a vector 141 | import numpy as np 142 | vector = np.random.rand(64).astype(np.float32) 143 | vector_id = db.insert(vector) 144 | print(f"✅ Vector inserted with ID: {vector_id}") 145 | 146 | # Search 147 | results = db.search(vector, k=1) 148 | print(f"✅ Search completed, found {len(results)} results") 149 | 150 | print("✅ All functionality verified successfully!") 151 | return True 152 | 153 | except Exception as e: 154 | print(f"❌ Example failed: {e}") 155 | return False 156 | 157 | 158 | def main(): 159 | """Main installation function.""" 160 | print("🚀 OctaneDB Vector Database Installation") 161 | print("=" * 50) 162 | 163 | # Check Python version 164 | if not check_python_version(): 165 | sys.exit(1) 166 | 167 | # Install dependencies 168 | if not install_dependencies(): 169 | print("\n❌ Failed to install dependencies. Please check the errors above.") 170 | sys.exit(1) 171 | 172 | # Install development dependencies (optional) 173 | install_development_dependencies() 174 | 175 | # Install OctaneDB 176 | if not install_octanedb(): 177 | print("\n❌ Failed to install OctaneDB. Please check the errors above.") 178 | sys.exit(1) 179 | 180 | # Run tests 181 | run_tests() 182 | 183 | # Run example 184 | run_example() 185 | 186 | print("\n🎉 Installation completed successfully!") 187 | print("\n📚 Next steps:") 188 | print(" 1. Check out the examples/ directory for usage examples") 189 | print(" 2. Read the README.md for detailed documentation") 190 | print(" 3. Run 'python examples/basic_usage.py' to see it in action") 191 | print(" 4. 
Run 'python examples/performance_benchmark.py' for performance tests") 192 | 193 | print("\n🔧 Development setup:") 194 | print(" - Run 'python -m pytest tests/' to run tests") 195 | print(" - Run 'black .' to format code") 196 | print(" - Run 'flake8 .' to check code quality") 197 | 198 | print("\n💡 For help and support:") 199 | print(" - Check the README.md file") 200 | print(" - Look at the examples/ directory") 201 | print(" - Run the test suite for verification") 202 | 203 | 204 | if __name__ == "__main__": 205 | main() 206 | -------------------------------------------------------------------------------- /PYPI_PUBLISHING_GUIDE.md: -------------------------------------------------------------------------------- 1 | # 🚀 OctaneDB PyPI Publishing Guide 2 | 3 | This guide will walk you through publishing OctaneDB to the Python Package Index (PyPI) so users can install it with `pip install octanedb`. 4 | 5 | ## 📋 Prerequisites 6 | 7 | ### **1. PyPI Account Setup** 8 | 9 | #### **Test PyPI (Recommended for first time)** 10 | 1. Go to [Test PyPI](https://test.pypi.org/account/register/) 11 | 2. Create an account with username and email 12 | 3. Verify your email address 13 | 4. Note your username and password 14 | 15 | #### **Production PyPI** 16 | 1. Go to [PyPI](https://pypi.org/account/register/) 17 | 2. Create an account with username and email 18 | 3. Verify your email address 19 | 4. **Enable 2FA** (recommended for security) 20 | 5. Note your username and password 21 | 22 | ### **2. Install Required Tools** 23 | 24 | ```bash 25 | pip install twine build 26 | ``` 27 | 28 | ### **3. Configure PyPI Credentials** 29 | 30 | Create a `.pypirc` file in your home directory: 31 | 32 | **Windows:** 33 | ```bash 34 | # Create file: C:\Users\YourUsername\.pypirc 35 | ``` 36 | 37 | **macOS/Linux:** 38 | ```bash 39 | # Create file: ~/.pypirc 40 | ``` 41 | 42 | **File content:** 43 | ```ini 44 | [distutils] 45 | index-servers = 46 | testpypi 47 | pypi 48 | 49 | [testpypi] 50 | repository = https://test.pypi.org/legacy/ 51 | username = your_test_username 52 | password = your_test_password 53 | 54 | [pypi] 55 | repository = https://upload.pypi.org/legacy/ 56 | username = your_production_username 57 | password = your_production_password 58 | ``` 59 | 60 | **⚠️ Security Note:** For production, consider using API tokens instead of passwords: 61 | ```ini 62 | [pypi] 63 | repository = https://upload.pypi.org/legacy/ 64 | username = __token__ 65 | password = pypi-your_api_token_here 66 | ``` 67 | 68 | ## 🚀 Publishing Workflow 69 | 70 | ### **Option 1: Automated Publishing (Recommended)** 71 | 72 | Use the provided publishing script: 73 | 74 | ```bash 75 | python publish_to_pypi.py 76 | ``` 77 | 78 | The script will guide you through: 79 | 1. ✅ Prerequisites check 80 | 2. 🧹 Clean build files 81 | 3. 🔨 Build package 82 | 4. 🔍 Check package 83 | 5. 🚀 Upload to PyPI 84 | 6. ✅ Verify installation 85 | 86 | ### **Option 2: Manual Publishing** 87 | 88 | #### **Step 1: Update Package Information** 89 | 90 | Before publishing, update these files: 91 | 92 | 1. **`setup.py`** - Update author email and GitHub URL 93 | 2. **`CHANGELOG.md`** - Ensure version matches 94 | 3. 
**`README.md`** - Verify all links work 95 | 96 | #### **Step 2: Clean Previous Builds** 97 | 98 | ```bash 99 | # Remove previous build artifacts 100 | rm -rf build/ dist/ *.egg-info/ 101 | ``` 102 | 103 | #### **Step 3: Build Package** 104 | 105 | ```bash 106 | # Build source distribution 107 | python -m build --sdist 108 | 109 | # Build wheel distribution 110 | python -m build --wheel 111 | ``` 112 | 113 | #### **Step 4: Check Package** 114 | 115 | ```bash 116 | # Verify package integrity 117 | twine check dist/* 118 | ``` 119 | 120 | #### **Step 5: Upload to Test PyPI (Recommended First)** 121 | 122 | ```bash 123 | # Upload to Test PyPI 124 | twine upload --repository testpypi dist/* 125 | 126 | # Test installation 127 | pip install --index-url https://test.pypi.org/simple/ octanedb 128 | ``` 129 | 130 | #### **Step 6: Upload to Production PyPI** 131 | 132 | ```bash 133 | # Upload to Production PyPI 134 | twine upload dist/* 135 | 136 | # Verify installation 137 | pip install octanedb 138 | ``` 139 | 140 | ## 📦 Package Structure 141 | 142 | Your package structure should look like this: 143 | 144 | ``` 145 | OctaneDB/ 146 | ├── octanedb/ 147 | │ ├── __init__.py 148 | │ ├── core.py 149 | │ ├── collection.py 150 | │ ├── index.py 151 | │ ├── storage.py 152 | │ ├── query.py 153 | │ ├── utils.py 154 | │ └── cli.py 155 | ├── tests/ 156 | ├── examples/ 157 | ├── setup.py 158 | ├── MANIFEST.in 159 | ├── requirements.txt 160 | ├── README.md 161 | ├── CHANGELOG.md 162 | ├── LICENSE 163 | └── .pypirc 164 | ``` 165 | 166 | ## 🔧 Configuration Files 167 | 168 | ### **setup.py** 169 | - ✅ Package metadata 170 | - ✅ Dependencies 171 | - ✅ Classifiers 172 | - ✅ Entry points (CLI) 173 | 174 | ### **MANIFEST.in** 175 | - ✅ Include source files 176 | - ✅ Include documentation 177 | - ✅ Exclude build artifacts 178 | 179 | ### **requirements.txt** 180 | - ✅ Core dependencies 181 | - ✅ Version constraints 182 | - ✅ No development dependencies 183 | 184 | ## 🧪 Testing Before Publishing 185 | 186 | ### **1. Local Installation Test** 187 | 188 | ```bash 189 | # Install in development mode 190 | pip install -e . 191 | 192 | # Test import 193 | python -c "import octanedb; print('✅ Import successful')" 194 | 195 | # Test CLI 196 | octanedb info 197 | ``` 198 | 199 | ### **2. Package Build Test** 200 | 201 | ```bash 202 | # Build package 203 | python -m build 204 | 205 | # Check package 206 | twine check dist/* 207 | 208 | # List package contents 209 | tar -tzf dist/*.tar.gz 210 | ``` 211 | 212 | ### **3. Test PyPI Upload** 213 | 214 | ```bash 215 | # Upload to Test PyPI 216 | twine upload --repository testpypi dist/* 217 | 218 | # Install from Test PyPI 219 | pip install --index-url https://test.pypi.org/simple/ octanedb 220 | 221 | # Test functionality 222 | python -c "import octanedb; db = octanedb.OctaneDB(384); print('✅ Test successful')" 223 | ``` 224 | 225 | ## 🚀 Publishing Checklist 226 | 227 | Before publishing, ensure: 228 | 229 | - [ ] **Version updated** in `setup.py` and `CHANGELOG.md` 230 | - [ ] **Author information** updated in `setup.py` 231 | - [ ] **GitHub URLs** updated in `setup.py` 232 | - [ ] **Dependencies** correctly specified in `requirements.txt` 233 | - [ ] **README.md** is comprehensive and accurate 234 | - [ ] **Tests pass** locally 235 | - [ ] **Package builds** without errors 236 | - [ ] **Package checks** pass with `twine check` 237 | - [ ] **Test PyPI upload** successful 238 | - [ ] **Test installation** works from Test PyPI 239 | 240 | ## 📊 After Publishing 241 | 242 | ### **1. 
Monitor PyPI** 243 | - Check [PyPI project page](https://pypi.org/project/octanedb/) 244 | - Monitor download statistics 245 | - Check for any reported issues 246 | 247 | ### **2. Update Documentation** 248 | - Add PyPI badge to README 249 | - Update installation instructions 250 | - Share on social media/GitHub 251 | 252 | ### **3. Version Management** 253 | - Tag release in Git: `git tag v1.0.0` 254 | - Push tags: `git push --tags` 255 | - Plan next release 256 | 257 | ## 🔄 Updating Package 258 | 259 | For future updates: 260 | 261 | 1. **Update version** in `setup.py` and `CHANGELOG.md` 262 | 2. **Update CHANGELOG.md** with new features/fixes 263 | 3. **Test locally** with new version 264 | 4. **Build and upload** new version 265 | 5. **Tag new release** in Git 266 | 267 | ## 🆘 Troubleshooting 268 | 269 | ### **Common Issues** 270 | 271 | #### **Authentication Errors** 272 | ```bash 273 | # Check .pypirc file 274 | cat ~/.pypirc 275 | 276 | # Test credentials 277 | twine check --repository testpypi dist/* 278 | ``` 279 | 280 | #### **Package Build Errors** 281 | ```bash 282 | # Clean build files 283 | rm -rf build/ dist/ *.egg-info/ 284 | 285 | # Rebuild 286 | python -m build 287 | ``` 288 | 289 | #### **Import Errors After Installation** 290 | ```bash 291 | # Check package structure 292 | pip show octanedb 293 | 294 | # Verify files 295 | python -c "import octanedb; print(octanedb.__file__)" 296 | ``` 297 | 298 | #### **Version Conflicts** 299 | ```bash 300 | # Uninstall previous version 301 | pip uninstall octanedb 302 | 303 | # Install new version 304 | pip install octanedb 305 | ``` 306 | 307 | ## 📚 Additional Resources 308 | 309 | - [PyPI Help](https://pypi.org/help/) 310 | - [Python Packaging User Guide](https://packaging.python.org/) 311 | - [Twine Documentation](https://twine.readthedocs.io/) 312 | - [Test PyPI](https://test.pypi.org/) 313 | 314 | ## 🎉 Success! 315 | 316 | Once published, users can install OctaneDB with: 317 | 318 | ```bash 319 | pip install octanedb 320 | ``` 321 | 322 | And use it in their code: 323 | 324 | ```python 325 | from octanedb import OctaneDB 326 | 327 | db = OctaneDB(dimension=384) 328 | collection = db.create_collection("my_collection") 329 | ``` 330 | 331 | --- 332 | 333 | **Need Help?** Open an issue on GitHub or check the troubleshooting section above. 334 | -------------------------------------------------------------------------------- /octanedb/cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | OctaneDB Command Line Interface 4 | Provides command-line access to OctaneDB functionality 5 | """ 6 | 7 | import argparse 8 | import sys 9 | import json 10 | from pathlib import Path 11 | from typing import Optional, List 12 | import numpy as np 13 | 14 | from . 
import OctaneDB 15 | 16 | 17 | def create_collection(args): 18 | """Create a new collection.""" 19 | try: 20 | db = OctaneDB(dimension=args.dimension) 21 | collection = db.create_collection(args.name) 22 | print(f"Collection '{args.name}' created successfully with dimension {args.dimension}") 23 | return 0 24 | except Exception as e: 25 | print(f" Failed to create collection: {e}") 26 | return 1 27 | 28 | 29 | def insert_vectors(args): 30 | """Insert vectors into a collection.""" 31 | try: 32 | db = OctaneDB(dimension=args.dimension) 33 | db.create_collection(args.collection) 34 | db.use_collection(args.collection) 35 | 36 | # Generate sample vectors 37 | vectors = np.random.randn(args.count, args.dimension).astype(np.float32) 38 | metadata = [{"id": i, "description": f"Vector {i}"} for i in range(args.count)] 39 | 40 | inserted_ids = db.insert(vectors=vectors, metadata=metadata) 41 | print(f"Successfully inserted {len(inserted_ids)} vectors into collection '{args.collection}'") 42 | return 0 43 | except Exception as e: 44 | print(f" Failed to insert vectors: {e}") 45 | return 1 46 | 47 | 48 | def search_vectors(args): 49 | """Search for similar vectors.""" 50 | try: 51 | db = OctaneDB(dimension=args.dimension) 52 | db.create_collection(args.collection) 53 | db.use_collection(args.collection) 54 | 55 | # Insert some sample data first 56 | sample_vectors = np.random.randn(10, args.dimension).astype(np.float32) 57 | sample_metadata = [{"id": i, "description": f"Sample {i}"} for i in range(10)] 58 | db.insert(vectors=sample_vectors, metadata=sample_metadata) 59 | 60 | # Create query vector 61 | query_vector = np.random.randn(args.dimension).astype(np.float32) 62 | 63 | # Perform search 64 | results = db.search(query_vector=query_vector, k=args.k, include_metadata=True) 65 | 66 | print(f"🔍 Search results for collection '{args.collection}':") 67 | for i, (vector_id, distance, metadata) in enumerate(results): 68 | print(f" {i+1}. 
ID: {vector_id}, Distance: {distance:.6f}") 69 | if metadata: 70 | print(f" Metadata: {metadata}") 71 | 72 | return 0 73 | except Exception as e: 74 | print(f"Failed to search vectors: {e}") 75 | return 1 76 | 77 | 78 | def benchmark(args): 79 | """Run performance benchmarks.""" 80 | try: 81 | print("OctaneDB Performance Benchmark") 82 | print("=" * 50) 83 | 84 | db = OctaneDB(dimension=args.dimension) 85 | collection = db.create_collection("benchmark") 86 | db.use_collection("benchmark") 87 | 88 | # Insert benchmark 89 | print(f"Inserting {args.count:,} vectors...") 90 | start_time = __import__('time').time() 91 | 92 | vectors = np.random.randn(args.count, args.dimension).astype(np.float32) 93 | metadata = [{"id": i} for i in range(args.count)] 94 | 95 | inserted_ids = db.insert(vectors=vectors, metadata=metadata) 96 | insert_time = __import__('time').time() - start_time 97 | 98 | print(f"Inserted {len(inserted_ids):,} vectors in {insert_time:.4f}s") 99 | print(f" Rate: {args.count/insert_time:.0f} vectors/second") 100 | 101 | # Search benchmark 102 | print(f"\n Running search benchmark...") 103 | start_time = __import__('time').time() 104 | 105 | query_vectors = np.random.randn(100, args.dimension).astype(np.float32) 106 | total_results = 0 107 | 108 | for i, query_vector in enumerate(query_vectors): 109 | results = db.search(query_vector=query_vector, k=10) 110 | total_results += len(results) 111 | 112 | if (i + 1) % 20 == 0: 113 | print(f" Processed {i + 1}/100 queries...") 114 | 115 | search_time = __import__('time').time() - start_time 116 | 117 | print(f"Search completed in {search_time:.4f}s") 118 | print(f" Rate: {100/search_time:.1f} queries/second") 119 | print(f" Total results: {total_results:,}") 120 | 121 | # Memory usage 122 | stats = db.get_stats() 123 | print(f"\n Database Statistics:") 124 | print(f" Collections: {stats['collection_count']}") 125 | print(f" Total vectors: {stats['total_vectors']}") 126 | print(f" Current collection: {stats['current_collection']}") 127 | 128 | return 0 129 | except Exception as e: 130 | print(f"Benchmark failed: {e}") 131 | return 1 132 | 133 | 134 | def info(args): 135 | """Show OctaneDB information.""" 136 | print("OctaneDB - Lightning Fast Vector Database") 137 | print("=" * 50) 138 | print(f"Version: 1.0.1") 139 | print(f"Python: {sys.version}") 140 | print(f"NumPy: {np.__version__}") 141 | print(f"Author: Rijin") 142 | print(f"License: MIT") 143 | print(f"GitHub: https://github.com/RijinRaju/octanedb") 144 | print("\nFeatures:") 145 | print(" HNSW Indexing for fast similarity search") 146 | print(" Multiple distance metrics (cosine, euclidean, dot)") 147 | print(" HDF5 storage with msgpack metadata") 148 | print(" In-memory, persistent, and hybrid storage") 149 | print(" ChromaDB-compatible API") 150 | print(" Automatic text embedding generation") 151 | print("\nInstallation:") 152 | print(" pip install octanedb") 153 | print("\nQuick Start:") 154 | print(" from octanedb import OctaneDB") 155 | print(" db = OctaneDB(dimension=384)") 156 | print(" collection = db.create_collection('my_collection')") 157 | return 0 158 | 159 | 160 | def main(): 161 | """Main CLI entry point.""" 162 | parser = argparse.ArgumentParser( 163 | description="OctaneDB - Lightning Fast Vector Database", 164 | formatter_class=argparse.RawDescriptionHelpFormatter, 165 | epilog=""" 166 | Examples: 167 | octanedb create --name documents --dimension 384 168 | octanedb insert --collection documents --count 1000 --dimension 384 169 | octanedb search --collection
documents --k 5 --dimension 384 170 | octanedb benchmark --count 10000 --dimension 384 171 | octanedb info 172 | """ 173 | ) 174 | 175 | subparsers = parser.add_subparsers(dest='command', help='Available commands') 176 | 177 | # Create collection command 178 | create_parser = subparsers.add_parser('create', help='Create a new collection') 179 | create_parser.add_argument('--name', required=True, help='Collection name') 180 | create_parser.add_argument('--dimension', type=int, default=384, help='Vector dimension') 181 | create_parser.set_defaults(func=create_collection) 182 | 183 | # Insert vectors command 184 | insert_parser = subparsers.add_parser('insert', help='Insert vectors into a collection') 185 | insert_parser.add_argument('--collection', required=True, help='Collection name') 186 | insert_parser.add_argument('--count', type=int, default=100, help='Number of vectors to insert') 187 | insert_parser.add_argument('--dimension', type=int, default=384, help='Vector dimension') 188 | insert_parser.set_defaults(func=insert_vectors) 189 | 190 | # Search command 191 | search_parser = subparsers.add_parser('search', help='Search for similar vectors') 192 | search_parser.add_argument('--collection', required=True, help='Collection name') 193 | search_parser.add_argument('--k', type=int, default=5, help='Number of results to return') 194 | search_parser.add_argument('--dimension', type=int, default=384, help='Vector dimension') 195 | search_parser.set_defaults(func=search_vectors) 196 | 197 | # Benchmark command 198 | benchmark_parser = subparsers.add_parser('benchmark', help='Run performance benchmarks') 199 | benchmark_parser.add_argument('--count', type=int, default=10000, help='Number of vectors for benchmark') 200 | benchmark_parser.add_argument('--dimension', type=int, default=384, help='Vector dimension') 201 | benchmark_parser.set_defaults(func=benchmark) 202 | 203 | # Info command 204 | info_parser = subparsers.add_parser('info', help='Show OctaneDB information') 205 | info_parser.set_defaults(func=info) 206 | 207 | # Parse arguments 208 | args = parser.parse_args() 209 | 210 | if not args.command: 211 | parser.print_help() 212 | return 1 213 | 214 | # Execute command 215 | return args.func(args) 216 | 217 | 218 | if __name__ == "__main__": 219 | sys.exit(main()) 220 | -------------------------------------------------------------------------------- /publish_to_pypi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | OctaneDB PyPI Publishing Script 4 | Automates the process of publishing OctaneDB to PyPI 5 | """ 6 | 7 | import os 8 | import sys 9 | import subprocess 10 | import shutil 11 | from pathlib import Path 12 | import time 13 | 14 | def run_command(command, description): 15 | """Run a command and handle errors.""" 16 | print(f"🔄 {description}...") 17 | print(f" Command: {command}") 18 | 19 | try: 20 | result = subprocess.run(command, shell=True, check=True, capture_output=True, text=True) 21 | print(f" ✅ {description} completed successfully") 22 | if result.stdout: 23 | print(f" Output: {result.stdout.strip()}") 24 | return True 25 | except subprocess.CalledProcessError as e: 26 | print(f" ❌ {description} failed with exit code {e.returncode}") 27 | if e.stdout: 28 | print(f" Stdout: {e.stdout.strip()}") 29 | if e.stderr: 30 | print(f" Stderr: {e.stderr.strip()}") 31 | return False 32 | 33 | def check_prerequisites(): 34 | """Check if required tools are installed.""" 35 | print("🔍 Checking prerequisites...") 
36 | 37 | required_tools = [ 38 | ("python", "Python interpreter"), 39 | ("pip", "Python package installer"), 40 | ("twine", "PyPI upload tool"), 41 | ("build", "Python build tool"), 42 | ] 43 | 44 | missing_tools = [] 45 | 46 | for tool, description in required_tools: 47 | if shutil.which(tool) is None: 48 | missing_tools.append(f"{tool} ({description})") 49 | 50 | if missing_tools: 51 | print(f"❌ Missing required tools: {', '.join(missing_tools)}") 52 | print("\nInstall missing tools with:") 53 | print(" pip install twine build") 54 | return False 55 | 56 | print("✅ All prerequisites are available") 57 | return True 58 | 59 | def clean_build_files(): 60 | """Clean up previous build files.""" 61 | print("🧹 Cleaning up previous build files...") 62 | 63 | build_dirs = ["build", "dist", "*.egg-info"] 64 | 65 | for pattern in build_dirs: 66 | for path in Path(".").glob(pattern): 67 | if path.is_dir(): 68 | shutil.rmtree(path) 69 | print(f" Removed: {path}") 70 | elif path.is_file(): 71 | path.unlink() 72 | print(f" Removed: {path}") 73 | 74 | print("✅ Build files cleaned up") 75 | 76 | def build_package(): 77 | """Build the Python package.""" 78 | print("🔨 Building Python package...") 79 | 80 | # Build source distribution 81 | if not run_command("python -m build --sdist", "Building source distribution"): 82 | return False 83 | 84 | # Build wheel distribution 85 | if not run_command("python -m build --wheel", "Building wheel distribution"): 86 | return False 87 | 88 | print("✅ Package built successfully") 89 | return True 90 | 91 | def check_package(): 92 | """Check the built package for issues.""" 93 | print("🔍 Checking built package...") 94 | 95 | # Check source distribution 96 | if not run_command("twine check dist/*", "Checking source distribution"): 97 | return False 98 | 99 | print("✅ Package check completed successfully") 100 | return True 101 | 102 | def test_upload(test_pypi=True): 103 | """Upload to PyPI (test or production).""" 104 | if test_pypi: 105 | print("🚀 Uploading to Test PyPI...") 106 | repository = "https://test.pypi.org/legacy/" 107 | print(" This is a test upload - package will not be available on main PyPI") 108 | else: 109 | print("🚀 Uploading to Production PyPI...") 110 | repository = "https://upload.pypi.org/legacy/" 111 | print(" ⚠️ This will make the package publicly available!") 112 | 113 | # Confirm upload 114 | if not test_pypi: 115 | confirm = input("\n🤔 Are you sure you want to upload to production PyPI? 
(yes/no): ") 116 | if confirm.lower() != "yes": 117 | print("❌ Upload cancelled") 118 | return False 119 | 120 | # Upload to PyPI 121 | if not run_command(f"twine upload --repository-url {repository} dist/*", "Uploading package"): 122 | return False 123 | 124 | print("✅ Package uploaded successfully!") 125 | 126 | if test_pypi: 127 | print("\n📋 Test PyPI URL: https://test.pypi.org/project/octanedb/") 128 | print("📋 Test installation: pip install --index-url https://test.pypi.org/simple/ octanedb") 129 | else: 130 | print("\n📋 Production PyPI URL: https://pypi.org/project/octanedb/") 131 | print("📋 Production installation: pip install octanedb") 132 | 133 | return True 134 | 135 | def verify_installation(test_pypi=True): 136 | """Verify the package can be installed.""" 137 | print("🔍 Verifying package installation...") 138 | 139 | if test_pypi: 140 | install_cmd = "pip install --index-url https://test.pypi.org/simple/ octanedb" 141 | else: 142 | install_cmd = "pip install octanedb" 143 | 144 | if not run_command(install_cmd, "Installing package"): 145 | return False 146 | 147 | # Test import 148 | if not run_command("python -c 'import octanedb; print(f\"✅ OctaneDB {octanedb.__version__} imported successfully\")'", "Testing import"): 149 | return False 150 | 151 | print("✅ Package installation verified successfully") 152 | return True 153 | 154 | def show_next_steps(): 155 | """Show next steps after successful upload.""" 156 | print("\n🎉 Congratulations! OctaneDB has been published to PyPI!") 157 | print("\n📋 Next Steps:") 158 | print("1. 🐛 Monitor for any issues or bugs") 159 | print("2. 📚 Update documentation if needed") 160 | print("3. 🚀 Announce the release on social media/GitHub") 161 | print("4. 📊 Monitor download statistics on PyPI") 162 | print("5. 🔄 Plan next release and version bump") 163 | 164 | print("\n🔗 Useful Links:") 165 | print(f" - PyPI Project: https://pypi.org/project/octanedb/") 166 | print(f" - GitHub Repository: https://github.com/RijinRaju/octanedb") 167 | print(f" - Documentation: https://github.com/RijinRaju/octanedb#readme") 168 | 169 | print("\n💡 Tips:") 170 | print(" - Users can now install with: pip install octanedb") 171 | print(" - Consider adding badges to your README") 172 | print(" - Monitor PyPI analytics for usage insights") 173 | 174 | def main(): 175 | """Main publishing workflow.""" 176 | print("🚀 OctaneDB PyPI Publishing Script") 177 | print("=" * 60) 178 | print("This script will publish OctaneDB to PyPI") 179 | print("=" * 60) 180 | 181 | # Check prerequisites 182 | if not check_prerequisites(): 183 | print("❌ Prerequisites check failed. Please install missing tools.") 184 | return 1 185 | 186 | # Ask user what they want to do 187 | print("\n🤔 What would you like to do?") 188 | print("1. Test upload to Test PyPI (recommended for first time)") 189 | print("2. Upload to Production PyPI") 190 | print("3.
Just build and check package (no upload)") 191 | 192 | choice = input("\nEnter your choice (1-3): ").strip() 193 | 194 | if choice == "1": 195 | # Test PyPI workflow 196 | print("\n🧪 Starting Test PyPI workflow...") 197 | 198 | clean_build_files() 199 | 200 | 201 | if not build_package(): 202 | return 1 203 | 204 | if not check_package(): 205 | return 1 206 | 207 | if not test_upload(test_pypi=True): 208 | return 1 209 | 210 | if not verify_installation(test_pypi=True): 211 | return 1 212 | 213 | print("\n✅ Test PyPI workflow completed successfully!") 214 | print("📋 You can now test the package installation from Test PyPI") 215 | 216 | elif choice == "2": 217 | # Production PyPI workflow 218 | print("\n🚀 Starting Production PyPI workflow...") 219 | 220 | clean_build_files() 221 | 222 | 223 | if not build_package(): 224 | return 1 225 | 226 | if not check_package(): 227 | return 1 228 | 229 | if not test_upload(test_pypi=False): 230 | return 1 231 | 232 | if not verify_installation(test_pypi=False): 233 | return 1 234 | 235 | show_next_steps() 236 | 237 | elif choice == "3": 238 | # Just build and check 239 | print("\n🔨 Building and checking package only...") 240 | 241 | clean_build_files() 242 | 243 | 244 | if not build_package(): 245 | return 1 246 | 247 | if not check_package(): 248 | return 1 249 | 250 | print("\n✅ Package built and checked successfully!") 251 | print("📋 Package files are in the 'dist/' directory") 252 | print("📋 You can manually upload them later with: twine upload dist/*") 253 | 254 | else: 255 | print("❌ Invalid choice. Please run the script again.") 256 | return 1 257 | 258 | return 0 259 | 260 | if __name__ == "__main__": 261 | sys.exit(main()) 262 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | # OctaneDB - Lightweight & Fast Vector Database 6 | 7 | [![PyPI version](https://badge.fury.io/py/octanedb.svg)](https://badge.fury.io/py/octanedb) 8 | [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/) 9 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 10 | 11 | **OctaneDB** is a lightweight, high-performance Python vector database library built with modern Python and optimized algorithms. It's perfect for AI/ML applications requiring fast similarity search with HNSW indexing and flexible storage options. 12 | 13 | ## **Key Features** 14 | 15 | ### **Performance** 16 | - **Fast HNSW indexing** for approximate nearest neighbor search 17 | - **Sub-millisecond** query response times for typical workloads 18 | - **Efficient insertion** with configurable batch sizes 19 | - **Optimized memory usage** with HDF5 compression 20 | 21 | ### **Advanced Indexing** 22 | - **HNSW (Hierarchical Navigable Small World)** for ultra-fast approximate search 23 | - **FlatIndex** for exact similarity search 24 | - **Configurable parameters** for performance tuning 25 | - **Automatic index optimization** 26 | 27 | ### **Text Embedding Support** 28 | 29 | - **Automatic text-to-vector conversion** using sentence-transformers 30 | - **Multiple embedding models** (all-MiniLM-L6-v2, all-mpnet-base-v2, etc.)
31 | - **GPU acceleration** support (CUDA) 32 | - **Batch processing** for improved performance 33 | 34 | ### **Flexible Storage** 35 | - **In-memory** for maximum speed 36 | - **Persistent** file-based storage 37 | - **Hybrid** mode for best of both worlds 38 | - **HDF5 format** for efficient compression 39 | 40 | ### **Powerful Search** 41 | - **Multiple distance metrics**: Cosine, Euclidean, Dot Product, Manhattan, Chebyshev, Jaccard 42 | - **Advanced metadata filtering** with logical operators 43 | - **Batch search** operations 44 | - **Text-based search** with automatic embedding 45 | 46 | 47 | 48 | ### **Installation** 49 | 50 | ```bash 51 | pip install octanedb 52 | ``` 53 | 54 | ### **Basic Usage** 55 | 56 | ```python 57 | from octanedb import OctaneDB 58 | 59 | # Initialize with text embedding support 60 | db = OctaneDB( 61 | dimension=384, # Will be auto-set by embedding model 62 | embedding_model="all-MiniLM-L6-v2" 63 | ) 64 | 65 | # Create a collection 66 | collection = db.create_collection("documents") 67 | db.use_collection("documents") 68 | 69 | # Add text documents (ChromaDB-compatible!) 70 | result = db.add( 71 | ids=["doc1", "doc2"], 72 | documents=[ 73 | "This is a document about pineapple", 74 | "This is a document about oranges" 75 | ], 76 | metadatas=[ 77 | {"category": "tropical", "color": "yellow"}, 78 | {"category": "citrus", "color": "orange"} 79 | ] 80 | ) 81 | 82 | # Search by text query 83 | results = db.search_text( 84 | query_text="fruit", 85 | k=2, 86 | filter="category == 'tropical'", 87 | include_metadata=True 88 | ) 89 | 90 | for doc_id, distance, metadata in results: 91 | print(f"Document: {db.get_document(doc_id)}") 92 | print(f"Distance: {distance:.4f}") 93 | print(f"Metadata: {metadata}") 94 | ``` 95 | 96 | ## **Text Embedding Examples** 97 | 98 | ### **Working Basic Usage** 99 | 100 | Here's a complete working example that demonstrates OctaneDB's core functionality: 101 | 102 | ```python 103 | from octanedb import OctaneDB 104 | 105 | # Initialize database with text embeddings 106 | db = OctaneDB( 107 | dimension=384, # sentence-transformers default dimension 108 | storage_mode="in-memory", 109 | enable_text_embeddings=True, 110 | embedding_model="all-MiniLM-L6-v2" # Lightweight model 111 | ) 112 | 113 | # Create a collection 114 | db.create_collection("fruits") 115 | db.use_collection("fruits") 116 | 117 | # Add some fruit documents 118 | fruits_data = [ 119 | {"id": "apple", "text": "Apple is a sweet and crunchy fruit that grows on trees.", "category": "temperate"}, 120 | {"id": "banana", "text": "Banana is a yellow tropical fruit rich in potassium.", "category": "tropical"}, 121 | {"id": "mango", "text": "Mango is a sweet tropical fruit with a large seed.", "category": "tropical"}, 122 | {"id": "orange", "text": "Orange is a citrus fruit with a bright orange peel.", "category": "citrus"} 123 | ] 124 | 125 | for fruit in fruits_data: 126 | db.add( 127 | ids=[fruit["id"]], 128 | documents=[fruit["text"]], 129 | metadatas=[{"category": fruit["category"], "type": "fruit"}] 130 | ) 131 | 132 | # Simple text search 133 | results = db.search_text(query_text="sweet", k=2, include_metadata=True) 134 | print("Sweet fruits:") 135 | for doc_id, distance, metadata in results: 136 | print(f" • {doc_id}: {metadata.get('document', 'N/A')[:50]}...") 137 | 138 | # Text search with filter 139 | results = db.search_text( 140 | query_text="fruit", 141 | k=2, 142 | filter="category == 'tropical'", 143 | include_metadata=True 144 | ) 145 | print("\nTropical fruits:") 
146 | for doc_id, distance, metadata in results: 147 | print(f" • {doc_id}: {metadata.get('document', 'N/A')[:50]}...") 148 | ``` 149 | 150 | 151 | ### **Advanced Text Operations** 152 | 153 | ```python 154 | # Batch text search 155 | query_texts = ["machine learning", "artificial intelligence", "data science"] 156 | batch_results = db.search_text_batch( 157 | query_texts=query_texts, 158 | k=5, 159 | include_metadata=True 160 | ) 161 | 162 | # Change embedding models 163 | db.change_embedding_model("all-mpnet-base-v2") # Higher quality, 768 dimensions 164 | 165 | # Get available models 166 | models = db.get_available_models() 167 | print(f"Available models: {models}") 168 | ``` 169 | 170 | ### **Custom Embeddings** 171 | 172 | ```python 173 | # Use pre-computed embeddings 174 | custom_embeddings = np.random.randn(100, 384).astype(np.float32) 175 | result = db.add( 176 | ids=[f"vec_{i}" for i in range(100)], 177 | embeddings=custom_embeddings, 178 | metadatas=[{"source": "custom"} for _ in range(100)] 179 | ) 180 | ``` 181 | 182 | ## **Advanced Usage** 183 | 184 | ### **Performance Tuning** 185 | 186 | ```python 187 | # Optimize for speed vs. accuracy 188 | db = OctaneDB( 189 | dimension=384, 190 | m=8, # Fewer connections = faster, less accurate 191 | ef_construction=100, # Lower = faster build 192 | ef_search=50 # Lower = faster search 193 | ) 194 | ``` 195 | 196 | ### **Storage Management** 197 | 198 | ```python 199 | # Persistent storage 200 | db = OctaneDB( 201 | dimension=384, 202 | storage_path="./data", 203 | embedding_model="all-MiniLM-L6-v2" 204 | ) 205 | 206 | # Save and load 207 | db.save("./my_database.h5") 208 | loaded_db = OctaneDB.load("./my_database.h5") 209 | ``` 210 | 211 | ### **Metadata Filtering** 212 | 213 | ```python 214 | # Complex filters 215 | results = db.search_text( 216 | query_text="technology", 217 | k=10, 218 | filter={ 219 | "$and": [ 220 | {"category": "tech"}, 221 | {"$or": [ 222 | {"year": {"$gte": 2020}}, 223 | {"priority": "high"} 224 | ]} 225 | ] 226 | } 227 | ) 228 | ``` 229 | 230 | ## **Troubleshooting** 231 | 232 | ### **Common Issues** 233 | 234 | 1. **Empty search results**: Make sure to call `include_metadata=True` in your search methods to get metadata back. 235 | 236 | 2. **Query engine warnings**: The query engine for complex filters is under development. For now, use simple string filters like `"category == 'tropical'"`. 237 | 238 | 3. **Index not built**: The index is automatically built when needed, but you can manually trigger it with `collection._build_index()` if needed. 239 | 240 | 4. **Text embeddings not working**: Ensure you have `sentence-transformers` installed: `pip install sentence-transformers` 241 | 242 | ### **Working Example** 243 | 244 | ```python 245 | # This will work correctly: 246 | results = db.search_text( 247 | query_text="fruit", 248 | k=2, 249 | filter="category == 'tropical'", 250 | include_metadata=True # Important! 
251 | ) 252 | 253 | # Process results correctly: 254 | for doc_id, distance, metadata in results: 255 | print(f"ID: {doc_id}, Distance: {distance:.4f}") 256 | if metadata: 257 | print(f" Document: {metadata.get('document', 'N/A')}") 258 | print(f" Category: {metadata.get('category', 'N/A')}") 259 | ``` 260 | 261 | ## **Performance Benchmarks** 262 | 263 | ### **OctaneDB Performance Characteristics** 264 | 265 | **Test Environment:** 266 | - **Hardware**: Intel i5-1300H, 16GB RAM, SSD storage 267 | - **Dataset**: 100K vectors, 384 dimensions (float32) 268 | - **Index Type**: HNSW with default parameters (m=16, ef_construction=200, ef_search=100) 269 | - **Distance Metric**: Cosine similarity 270 | - **Storage Mode**: In-memory 271 | 272 | **Performance Results:** 273 | 274 | | Operation | Performance | Notes | 275 | |-----------|-------------|-------| 276 | | **Vector Insertion** | 2,800-3,500 vectors/sec | Single-threaded insertion with metadata | 277 | | **Index Build Time** | 45-60 seconds | HNSW index construction for 100K vectors | 278 | | **Single Query Search** | 0.5-2.0 milliseconds | k=10 nearest neighbors | 279 | | **Batch Search (100 queries)** | 150-200 queries/sec | k=10 per query | 280 | | **Memory Usage** | ~1.5GB | Including vectors, metadata, and HNSW index | 281 | | **Storage Efficiency** | ~15MB on disk | HDF5 compression for 100K vectors | 282 | 283 | **Performance Tuning Options:** 284 | - **Faster Build**: Reduce `ef_construction` (trades accuracy for speed) 285 | - **Faster Search**: Reduce `ef_search` (trades accuracy for speed) 286 | - **Memory Optimization**: Use `m=8` instead of `m=16` (fewer connections) 287 | - **Storage Mode**: In-memory for speed, persistent for data persistence 288 | 289 | **Benchmark Code:** 290 | ```bash 291 | # Run performance benchmarks using CLI 292 | octanedb benchmark --count 100000 --dimension 384 293 | 294 | # Or use the comprehensive Python benchmarking script 295 | python benchmark_octanedb.py --vectors 100000 --dimension 384 --runs 5 296 | 297 | # Or use the Python API directly 298 | from octanedb import OctaneDB 299 | db = OctaneDB(dimension=384) 300 | # ... run your own benchmarks 301 | ``` 302 | 303 | *Note: Performance varies based on hardware, dataset characteristics, and HNSW parameters. These numbers represent typical performance on the specified hardware configuration.* 304 | 305 | ## **Architecture** 306 | 307 | ``` 308 | OctaneDB 309 | ├── Core (OctaneDB) 310 | │ ├── Collection Management 311 | │ ├── Text Embedding Engine 312 | │ └── Storage Manager 313 | ├── Collections 314 | │ ├── Vector Storage (HDF5) 315 | │ ├── Metadata Management 316 | │ └── Index Management 317 | ├── Indexing 318 | │ ├── HNSW Index 319 | │ ├── Flat Index 320 | │ └── Distance Metrics 321 | ├── Text Processing 322 | │ ├── Sentence Transformers 323 | │ ├── GPU Acceleration 324 | │ └── Batch Processing 325 | └── Storage 326 | ├── HDF5 Vectors 327 | ├── Msgpack Metadata 328 | └── Compression 329 | ``` 330 | 331 | ## **Installation Options** 332 | 333 | ### **Basic Installation** 334 | ```bash 335 | pip install octanedb 336 | ``` 337 | 338 | ### **With GPU Support** 339 | ```bash 340 | pip install octanedb[gpu] 341 | ``` 342 | 343 | ### **Development Installation** 344 | ```bash 345 | git clone https://github.com/RijinRaju/octanedb.git 346 | cd octanedb 347 | pip install -e . 
348 | ``` 349 | 350 | ## **Requirements** 351 | 352 | - **Python**: 3.8+ 353 | - **Core Dependencies**: NumPy, h5py, msgpack, tqdm 354 | - **Text Embeddings**: sentence-transformers, transformers, torch 355 | - **Optional**: CUDA for GPU acceleration, matplotlib, pandas, seaborn for benchmarking 356 | 357 | 358 | ## **Contributing** 359 | 360 | We welcome contributions! Please see our [Contributing Guide](CONTRIBUTING.md) for details. 361 | 362 | ### **Development Setup** 363 | ```bash 364 | git clone https://github.com/RijinRaju/octanedb.git 365 | cd octanedb 366 | pip install -e ".[dev]" 367 | pytest tests/ 368 | ``` 369 | 370 | ## **License** 371 | 372 | This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details. 373 | 374 | ## **Acknowledgments** 375 | 376 | - **HNSW Algorithm**: Based on the Hierarchical Navigable Small World paper 377 | - **Sentence Transformers**: For text embedding capabilities 378 | - **HDF5**: For efficient vector storage 379 | - **NumPy**: For fast numerical operations 380 | 381 | ## **Development Note** 382 | 383 | **AI-Assisted Development**: This codebase was extensively developed with the assistance of Large Language Models (LLMs). The LLM assistance included: 384 | - Initial project structure 385 | - Core algorithm implementations (HNSW indexing, vector operations) 386 | - Documentation 387 | - Performance optimization suggestions 388 | 389 | -------------------------------------------------------------------------------- /octanedb/embeddings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Text embedding generation for OctaneDB. 3 | Provides automatic text-to-vector conversion using sentence-transformers. 4 | """ 5 | 6 | import logging 7 | from typing import List, Union, Optional, Dict, Any 8 | import numpy as np 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | try: 13 | from sentence_transformers import SentenceTransformer 14 | SENTENCE_TRANSFORMERS_AVAILABLE = True 15 | except ImportError: 16 | SENTENCE_TRANSFORMERS_AVAILABLE = False 17 | logger.warning("sentence-transformers not available. Install with: pip install sentence-transformers") 18 | 19 | try: 20 | import torch 21 | TORCH_AVAILABLE = True 22 | except ImportError: 23 | TORCH_AVAILABLE = False 24 | logger.warning("PyTorch not available. Install with: pip install torch") 25 | 26 | 27 | class TextEmbedder: 28 | """ 29 | Text embedding generator using sentence-transformers. 30 | 31 | Provides automatic text-to-vector conversion with support for: 32 | - Multiple embedding models 33 | - Batch processing 34 | - Custom model configuration 35 | - GPU acceleration (if available) 36 | """ 37 | 38 | def __init__( 39 | self, 40 | model_name: str = "all-MiniLM-L6-v2", 41 | device: Optional[str] = None, 42 | normalize_embeddings: bool = True, 43 | **kwargs 44 | ): 45 | """ 46 | Initialize the text embedder. 47 | 48 | Args: 49 | model_name: Name of the sentence-transformers model to use 50 | device: Device to use ('cpu', 'cuda', or None for auto) 51 | normalize_embeddings: Whether to normalize embeddings 52 | **kwargs: Additional arguments for SentenceTransformer 53 | """ 54 | if not SENTENCE_TRANSFORMERS_AVAILABLE: 55 | raise ImportError( 56 | "sentence-transformers is required for text embedding. 
" 57 | "Install with: pip install sentence-transformers" 58 | ) 59 | 60 | self.model_name = model_name 61 | self.normalize_embeddings = normalize_embeddings 62 | 63 | # Auto-detect device if not specified 64 | if device is None: 65 | if TORCH_AVAILABLE and torch.cuda.is_available(): 66 | device = "cuda" 67 | logger.info("CUDA detected, using GPU for embeddings") 68 | else: 69 | device = "cpu" 70 | logger.info("Using CPU for embeddings") 71 | 72 | self.device = device 73 | 74 | # Initialize the model 75 | logger.info(f"Loading sentence-transformers model: {model_name}") 76 | self.model = SentenceTransformer( 77 | model_name, 78 | device=device, 79 | **kwargs 80 | ) 81 | 82 | # Get model info 83 | self.dimension = self.model.get_sentence_embedding_dimension() 84 | logger.info(f"Model loaded successfully. Embedding dimension: {self.dimension}") 85 | 86 | # Performance tracking 87 | self._stats = { 88 | "embeddings_generated": 0, 89 | "total_tokens": 0, 90 | "total_time": 0.0 91 | } 92 | 93 | def embed_texts( 94 | self, 95 | texts: Union[str, List[str]], 96 | batch_size: int = 32, 97 | show_progress_bar: bool = False 98 | ) -> np.ndarray: 99 | """ 100 | Generate embeddings for text(s). 101 | 102 | Args: 103 | texts: Single text string or list of text strings 104 | batch_size: Batch size for processing 105 | show_progress_bar: Whether to show progress bar 106 | 107 | Returns: 108 | numpy array of embeddings with shape (n_texts, dimension) 109 | """ 110 | import time 111 | 112 | # Ensure texts is a list 113 | if isinstance(texts, str): 114 | texts = [texts] 115 | 116 | if not texts: 117 | return np.array([]) 118 | 119 | start_time = time.time() 120 | 121 | try: 122 | # Generate embeddings 123 | embeddings = self.model.encode( 124 | texts, 125 | batch_size=batch_size, 126 | show_progress_bar=show_progress_bar, 127 | normalize_embeddings=self.normalize_embeddings, 128 | convert_to_numpy=True 129 | ) 130 | 131 | # Ensure correct shape 132 | if embeddings.ndim == 1: 133 | embeddings = embeddings.reshape(1, -1) 134 | 135 | # Update stats 136 | self._stats["embeddings_generated"] += len(texts) 137 | self._stats["total_tokens"] += sum(len(text.split()) for text in texts) 138 | self._stats["total_time"] += time.time() - start_time 139 | 140 | logger.debug(f"Generated {len(texts)} embeddings in {time.time() - start_time:.4f}s") 141 | 142 | return embeddings.astype(np.float32) 143 | 144 | except Exception as e: 145 | logger.error(f"Error generating embeddings: {e}") 146 | raise 147 | 148 | def embed_single(self, text: str) -> np.ndarray: 149 | """ 150 | Generate embedding for a single text. 151 | 152 | Args: 153 | text: Text string to embed 154 | 155 | Returns: 156 | numpy array of embedding with shape (1, dimension) 157 | """ 158 | return self.embed_texts([text]) 159 | 160 | def get_model_info(self) -> Dict[str, Any]: 161 | """Get information about the loaded model.""" 162 | return { 163 | "model_name": self.model_name, 164 | "dimension": self.dimension, 165 | "device": self.device, 166 | "normalize_embeddings": self.normalize_embeddings, 167 | "stats": self._stats.copy() 168 | } 169 | 170 | def change_model(self, model_name: str, **kwargs): 171 | """ 172 | Change the embedding model. 
173 | 174 | Args: 175 | model_name: New model name 176 | **kwargs: Additional arguments for SentenceTransformer 177 | """ 178 | logger.info(f"Changing model from {self.model_name} to {model_name}") 179 | 180 | # Save current device 181 | device = self.device 182 | 183 | # Initialize new model 184 | self.model = SentenceTransformer( 185 | model_name, 186 | device=device, 187 | **kwargs 188 | ) 189 | 190 | self.model_name = model_name 191 | self.dimension = self.model.get_sentence_embedding_dimension() 192 | 193 | logger.info(f"Model changed successfully. New dimension: {self.dimension}") 194 | 195 | def get_available_models(self) -> List[str]: 196 | """Get list of available sentence-transformers models.""" 197 | # Common models that work well for most use cases 198 | return [ 199 | "all-MiniLM-L6-v2", # 384 dimensions, fast, good quality 200 | "all-MiniLM-L12-v2", # 384 dimensions, better quality, slower 201 | "all-mpnet-base-v2", # 768 dimensions, high quality 202 | "paraphrase-multilingual-MiniLM-L12-v2", # 384 dimensions, multilingual 203 | "paraphrase-MiniLM-L6-v2", # 384 dimensions, paraphrase-focused 204 | "distiluse-base-multilingual-cased-v2", # 512 dimensions, multilingual 205 | ] 206 | 207 | 208 | class ChromaCompatibleEmbedder(TextEmbedder): 209 | """ 210 | ChromaDB-compatible text embedder. 211 | 212 | Provides the same API as ChromaDB for easy migration: 213 | - add() method for documents 214 | - Automatic ID generation 215 | - Batch processing 216 | """ 217 | 218 | def __init__( 219 | self, 220 | model_name: str = "all-MiniLM-L6-v2", 221 | **kwargs 222 | ): 223 | """ 224 | Initialize ChromaDB-compatible embedder. 225 | 226 | Args: 227 | model_name: Sentence-transformers model name 228 | **kwargs: Additional arguments for TextEmbedder 229 | """ 230 | super().__init__(model_name, **kwargs) 231 | 232 | # Document storage (for ChromaDB compatibility) 233 | self._documents: Dict[str, str] = {} 234 | self._next_id = 0 235 | 236 | def add( 237 | self, 238 | ids: Optional[Union[str, List[str]]] = None, 239 | documents: Union[str, List[str]] = None, 240 | metadatas: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, 241 | embeddings: Optional[Union[List[float], List[List[float]]]] = None 242 | ) -> Dict[str, Any]: 243 | """ 244 | Add documents to the collection (ChromaDB-compatible API).
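        Example (illustrative; the IDs shown are auto-generated, not fixed names):
            embedder = ChromaCompatibleEmbedder()
            result = embedder.add(documents=["hello world"], metadatas=[{"lang": "en"}])
            # result["ids"] -> ["doc_0"] on a fresh embedder; embeddings are computed automatically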
245 | 246 | Args: 247 | ids: Document IDs (auto-generated if not provided) 248 | documents: Text documents to add 249 | metadatas: Optional metadata for documents 250 | embeddings: Pre-computed embeddings (optional) 251 | 252 | Returns: 253 | Dictionary with 'ids' and 'embeddings' keys 254 | """ 255 | # Ensure documents is a list 256 | if isinstance(documents, str): 257 | documents = [documents] 258 | 259 | if not documents: 260 | raise ValueError("No documents provided") 261 | 262 | num_docs = len(documents) 263 | 264 | # Handle IDs 265 | if ids is None: 266 | # Auto-generate IDs 267 | ids = [f"doc_{self._next_id + i}" for i in range(num_docs)] 268 | self._next_id += num_docs 269 | elif isinstance(ids, str): 270 | ids = [ids] 271 | 272 | if len(ids) != num_docs: 273 | raise ValueError("Number of IDs must match number of documents") 274 | 275 | # Handle metadata 276 | if metadatas is None: 277 | metadatas = [{} for _ in range(num_docs)] 278 | elif isinstance(metadatas, dict): 279 | metadatas = [metadatas for _ in range(num_docs)] 280 | 281 | if len(metadatas) != num_docs: 282 | raise ValueError("Number of metadatas must match number of documents") 283 | 284 | # Store documents 285 | for doc_id, doc_text in zip(ids, documents): 286 | self._documents[doc_id] = doc_text 287 | 288 | # Generate or use provided embeddings 289 | if embeddings is None: 290 | # Generate embeddings automatically 291 | embeddings = self.embed_texts(documents) 292 | else: 293 | # Convert to numpy array 294 | embeddings = np.array(embeddings, dtype=np.float32) 295 | if embeddings.ndim == 1: 296 | embeddings = embeddings.reshape(1, -1) 297 | 298 | # Prepare result 299 | result = { 300 | "ids": ids, 301 | "embeddings": embeddings, 302 | "documents": documents, 303 | "metadatas": metadatas 304 | } 305 | 306 | logger.info(f"Added {num_docs} documents with IDs: {ids}") 307 | return result 308 | 309 | def get_document(self, doc_id: str) -> Optional[str]: 310 | """Get a document by ID.""" 311 | return self._documents.get(doc_id) 312 | 313 | def list_documents(self) -> List[str]: 314 | """List all document IDs.""" 315 | return list(self._documents.keys()) 316 | 317 | def delete_document(self, doc_id: str) -> bool: 318 | """Delete a document by ID.""" 319 | if doc_id in self._documents: 320 | del self._documents[doc_id] 321 | logger.info(f"Deleted document: {doc_id}") 322 | return True 323 | return False 324 | 325 | def clear_documents(self): 326 | """Clear all documents.""" 327 | self._documents.clear() 328 | self._next_id = 0 329 | logger.info("Cleared all documents") 330 | 331 | def get_collection_info(self) -> Dict[str, Any]: 332 | """Get information about the document collection.""" 333 | return { 334 | "num_documents": len(self._documents), 335 | "model_info": self.get_model_info(), 336 | "document_ids": list(self._documents.keys()) 337 | } 338 | 339 | 340 | # Convenience function for quick embedding 341 | def embed_texts( 342 | texts: Union[str, List[str]], 343 | model_name: str = "all-MiniLM-L6-v2", 344 | **kwargs 345 | ) -> np.ndarray: 346 | """ 347 | Quick function to generate embeddings for texts. 
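    Example (illustrative):
        vectors = embed_texts(["hello world", "vector databases"], model_name="all-MiniLM-L6-v2")
        # vectors.shape == (2, 384) for this model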
348 | 349 | Args: 350 | texts: Text(s) to embed 351 | model_name: Model to use 352 | **kwargs: Additional arguments for TextEmbedder 353 | 354 | Returns: 355 | numpy array of embeddings 356 | """ 357 | embedder = TextEmbedder(model_name, **kwargs) 358 | return embedder.embed_texts(texts) 359 | -------------------------------------------------------------------------------- /octanedb/query.py: -------------------------------------------------------------------------------- 1 | """ 2 | Query engine for advanced filtering and query operations. 3 | """ 4 | 5 | import logging 6 | import re 7 | from typing import Dict, Any, List, Union, Callable 8 | from datetime import datetime 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class QueryEngine: 14 | """ 15 | Query engine for advanced filtering and query operations on vector metadata. 16 | 17 | Supports: 18 | - Complex filter expressions 19 | - Range queries 20 | - Text search 21 | - Logical operators 22 | - Custom filter functions 23 | """ 24 | 25 | def __init__(self): 26 | """Initialize the query engine.""" 27 | self._operators = { 28 | "$eq": self._eq, 29 | "$ne": self._ne, 30 | "$gt": self._gt, 31 | "$gte": self._gte, 32 | "$lt": self._lt, 33 | "$lte": self._lte, 34 | "$in": self._in, 35 | "$nin": self._nin, 36 | "$regex": self._regex, 37 | "$exists": self._exists, 38 | "$and": self._and, 39 | "$or": self._or, 40 | "$not": self._not, 41 | "$text": self._text_search 42 | } 43 | 44 | # Text search index for faster text queries 45 | self._text_index: Dict[str, Dict[str, List[int]]] = {} 46 | 47 | logger.info("Query engine initialized") 48 | 49 | def evaluate_filter(self, metadata: Dict[str, Any], filter_expr: Dict[str, Any]) -> bool: 50 | """ 51 | Evaluate a filter expression against metadata.
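        Example (illustrative; note the operator-first form this engine expects):
            engine = QueryEngine()
            engine.evaluate_filter(
                {"category": "tech", "year": 2021},
                {"$and": [{"category": "tech"}, {"$gte": {"year": 2020}}]}
            )  # -> True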
52 | 53 | Args: 54 | metadata: Vector metadata to evaluate 55 | filter_expr: Filter expression 56 | 57 | Returns: 58 | True if metadata matches filter, False otherwise 59 | """ 60 | try: 61 | return self._evaluate_expression(metadata, filter_expr) 62 | except Exception as e: 63 | logger.error(f"Error evaluating filter: {e}") 64 | return False 65 | 66 | def _evaluate_expression(self, metadata: Dict[str, Any], expr: Any) -> bool: 67 | """Evaluate a single expression.""" 68 | if isinstance(expr, dict): 69 | # Handle operator expressions 70 | for operator, value in expr.items(): 71 | if operator in self._operators: 72 | return self._operators[operator](metadata, value) 73 | else: 74 | # Direct field comparison 75 | return self._eq(metadata, {operator: value}) 76 | elif isinstance(expr, list): 77 | # Handle list expressions (AND logic) 78 | return all(self._evaluate_expression(metadata, item) for item in expr) 79 | else: 80 | # Simple value comparison 81 | return expr is True 82 | 83 | return False 84 | 85 | def _eq(self, metadata: Dict[str, Any], value: Any) -> bool: 86 | """Equality operator.""" 87 | if isinstance(value, dict): 88 | field, expected_value = next(iter(value.items())) 89 | return metadata.get(field) == expected_value 90 | return False 91 | 92 | def _ne(self, metadata: Dict[str, Any], value: Any) -> bool: 93 | """Not equality operator.""" 94 | if isinstance(value, dict): 95 | field, expected_value = next(iter(value.items())) 96 | return metadata.get(field) != expected_value 97 | return False 98 | 99 | def _gt(self, metadata: Dict[str, Any], value: Any) -> bool: 100 | """Greater than operator.""" 101 | if isinstance(value, dict): 102 | field, expected_value = next(iter(value.items())) 103 | field_value = metadata.get(field) 104 | if field_value is not None and expected_value is not None: 105 | return field_value > expected_value 106 | return False 107 | 108 | def _gte(self, metadata: Dict[str, Any], value: Any) -> bool: 109 | """Greater than or equal operator.""" 110 | if isinstance(value, dict): 111 | field, expected_value = next(iter(value.items())) 112 | field_value = metadata.get(field) 113 | if field_value is not None and expected_value is not None: 114 | return field_value >= expected_value 115 | return False 116 | 117 | def _lt(self, metadata: Dict[str, Any], value: Any) -> bool: 118 | """Less than operator.""" 119 | if isinstance(value, dict): 120 | field, expected_value = next(iter(value.items())) 121 | field_value = metadata.get(field) 122 | if field_value is not None and expected_value is not None: 123 | return field_value < expected_value 124 | return False 125 | 126 | def _lte(self, metadata: Dict[str, Any], value: Any) -> bool: 127 | """Less than or equal operator.""" 128 | if isinstance(value, dict): 129 | field, expected_value = next(iter(value.items())) 130 | field_value = metadata.get(field) 131 | if field_value is not None and expected_value is not None: 132 | return field_value <= expected_value 133 | return False 134 | 135 | def _in(self, metadata: Dict[str, Any], value: Any) -> bool: 136 | """In operator.""" 137 | if isinstance(value, dict): 138 | field, expected_values = next(iter(value.items())) 139 | if isinstance(expected_values, list): 140 | field_value = metadata.get(field) 141 | return field_value in expected_values 142 | return False 143 | 144 | def _nin(self, metadata: Dict[str, Any], value: Any) -> bool: 145 | """Not in operator.""" 146 | if isinstance(value, dict): 147 | field, expected_values = next(iter(value.items())) 148 | if 
isinstance(expected_values, list): 149 | field_value = metadata.get(field) 150 | return field_value not in expected_values 151 | return False 152 | 153 | def _regex(self, metadata: Dict[str, Any], value: Any) -> bool: 154 | """Regex operator.""" 155 | if isinstance(value, dict): 156 | field, pattern = next(iter(value.items())) 157 | field_value = metadata.get(field) 158 | if isinstance(field_value, str) and isinstance(pattern, str): 159 | try: 160 | return bool(re.search(pattern, field_value, re.IGNORECASE)) 161 | except re.error: 162 | return False 163 | return False 164 | 165 | def _exists(self, metadata: Dict[str, Any], value: Any) -> bool: 166 | """Exists operator.""" 167 | if isinstance(value, dict): 168 | field, should_exist = next(iter(value.items())) 169 | field_exists = field in metadata 170 | return field_exists == should_exist 171 | return False 172 | 173 | def _and(self, metadata: Dict[str, Any], value: Any) -> bool: 174 | """AND operator.""" 175 | if isinstance(value, list): 176 | return all(self._evaluate_expression(metadata, item) for item in value) 177 | return False 178 | 179 | def _or(self, metadata: Dict[str, Any], value: Any) -> bool: 180 | """OR operator.""" 181 | if isinstance(value, list): 182 | return any(self._evaluate_expression(metadata, item) for item in value) 183 | return False 184 | 185 | def _nor(self, metadata: Dict[str, Any], value: Any) -> bool: 186 | """NOR operator.""" 187 | if isinstance(value, list): 188 | return not any(self._evaluate_expression(metadata, item) for item in value) 189 | return False 190 | 191 | def _not(self, metadata: Dict[str, Any], value: Any) -> bool: 192 | """NOT operator.""" 193 | return not self._evaluate_expression(metadata, value) 194 | 195 | def _text_search(self, metadata: Dict[str, Any], value: Any) -> bool: 196 | """Text search operator.""" 197 | if isinstance(value, dict): 198 | field, search_text = next(iter(value.items())) 199 | field_value = metadata.get(field) 200 | if isinstance(field_value, str) and isinstance(search_text, str): 201 | return search_text.lower() in field_value.lower() 202 | return False 203 | 204 | def build_text_index(self, collection_name: str, metadata_list: List[Dict[str, Any]], vector_ids: List[int]) -> None: 205 | """ 206 | Build a text search index for faster text queries. 207 | 208 | Args: 209 | collection_name: Name of the collection 210 | metadata_list: List of metadata dictionaries 211 | vector_ids: Corresponding vector IDs 212 | """ 213 | if collection_name not in self._text_index: 214 | self._text_index[collection_name] = {} 215 | 216 | # Index text fields 217 | for metadata, vector_id in zip(metadata_list, vector_ids): 218 | for field, value in metadata.items(): 219 | if isinstance(value, str): 220 | if field not in self._text_index[collection_name]: 221 | self._text_index[collection_name][field] = {} 222 | 223 | # Create word index 224 | words = value.lower().split() 225 | for word in words: 226 | if word not in self._text_index[collection_name][field]: 227 | self._text_index[collection_name][field][word] = [] 228 | self._text_index[collection_name][field][word].append(vector_id) 229 | 230 | logger.info(f"Text index built for collection '{collection_name}'") 231 | 232 | def search_text(self, collection_name: str, field: str, query: str) -> List[int]: 233 | """ 234 | Search text using the built index. 
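        Example (illustrative; assumes the index was built first):
            engine.build_text_index("docs", [{"title": "fast vector search"}], [0])
            engine.search_text("docs", "title", "vector")  # -> [0]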
235 | 236 | Args: 237 | collection_name: Name of the collection 238 | field: Field to search in 239 | query: Search query 240 | 241 | Returns: 242 | List of vector IDs matching the query 243 | """ 244 | if collection_name not in self._text_index: 245 | return [] 246 | 247 | if field not in self._text_index[collection_name]: 248 | return [] 249 | 250 | query_words = query.lower().split() 251 | matching_ids = set() 252 | 253 | for word in query_words: 254 | if word in self._text_index[collection_name][field]: 255 | matching_ids.update(self._text_index[collection_name][field][word]) 256 | 257 | return list(matching_ids) 258 | 259 | def create_aggregation_pipeline(self, pipeline: List[Dict[str, Any]]) -> Callable: 260 | """ 261 | Create an aggregation pipeline for complex data processing. 262 | 263 | Args: 264 | pipeline: List of aggregation stages 265 | 266 | Returns: 267 | Callable function that executes the pipeline 268 | """ 269 | def execute_pipeline(data: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 270 | result = data 271 | 272 | for stage in pipeline: 273 | if "$match" in stage: 274 | result = [item for item in result if self.evaluate_filter(item, stage["$match"])] 275 | elif "$project" in stage: 276 | result = self._project_fields(result, stage["$project"]) 277 | elif "$sort" in stage: 278 | result = self._sort_data(result, stage["$sort"]) 279 | elif "$limit" in stage: 280 | result = result[:stage["$limit"]] 281 | elif "$skip" in stage: 282 | result = result[stage["$skip"]:] 283 | elif "$group" in stage: 284 | result = self._group_data(result, stage["$group"]) 285 | elif "$count" in stage: 286 | result = [{"count": len(result)}] 287 | 288 | return result 289 | 290 | return execute_pipeline 291 | 292 | def _project_fields(self, data: List[Dict[str, Any]], projection: Dict[str, Any]) -> List[Dict[str, Any]]: 293 | """Project specific fields from data.""" 294 | result = [] 295 | for item in data: 296 | projected_item = {} 297 | for field, include in projection.items(): 298 | if include: 299 | if field in item: 300 | projected_item[field] = item[field] 301 | result.append(projected_item) 302 | return result 303 | 304 | def _sort_data(self, data: List[Dict[str, Any]], sort_spec: Dict[str, Any]) -> List[Dict[str, Any]]: 305 | """Sort data based on sort specification.""" 306 | def sort_key(item): 307 | keys = [] 308 | for field, direction in sort_spec.items(): 309 | value = item.get(field, 0) 310 | if direction == -1: 311 | value = -value if isinstance(value, (int, float)) else value 312 | keys.append(value) 313 | return tuple(keys) 314 | 315 | return sorted(data, key=sort_key) 316 | 317 | def _group_data(self, data: List[Dict[str, Any]], group_spec: Dict[str, Any]) -> List[Dict[str, Any]]: 318 | """Group data based on group specification.""" 319 | groups = {} 320 | 321 | for item in data: 322 | group_key = tuple(item.get(field) for field in group_spec["_id"].values()) 323 | 324 | if group_key not in groups: 325 | groups[group_key] = {"_id": dict(zip(group_spec["_id"].keys(), group_key))} 326 | 327 | # Apply aggregations 328 | for field, agg in group_spec.items(): 329 | if field == "_id": 330 | continue 331 | 332 | if agg["$sum"] in item: 333 | if field not in groups[group_key]: 334 | groups[group_key][field] = 0 335 | groups[group_key][field] += item[agg["$sum"]] 336 | elif agg["$avg"] in item: 337 | if field not in groups[group_key]: 338 | groups[group_key][field] = {"sum": 0, "count": 0} 339 | groups[group_key][field]["sum"] += item[agg["$avg"]] 340 | groups[group_key][field]["count"] 
+= 1 341 | elif agg["$min"] in item: 342 | if field not in groups[group_key]: 343 | groups[group_key][field] = float('inf') 344 | groups[group_key][field] = min(groups[group_key][field], item[agg["$min"]]) 345 | elif agg["$max"] in item: 346 | if field not in groups[group_key]: 347 | groups[group_key][field] = float('-inf') 348 | groups[group_key][field] = max(groups[group_key][field], item[agg["$max"]]) 349 | 350 | # Convert averages 351 | for group in groups.values(): 352 | for field, value in group.items(): 353 | if isinstance(value, dict) and "sum" in value and "count" in value: 354 | group[field] = value["sum"] / value["count"] if value["count"] > 0 else 0 355 | 356 | return list(groups.values()) 357 | 358 | def get_query_stats(self) -> Dict[str, Any]: 359 | """Get query engine statistics.""" 360 | stats = { 361 | "text_indexes": len(self._text_index), 362 | "total_indexed_fields": sum(len(fields) for fields in self._text_index.values()), 363 | "total_indexed_words": sum( 364 | sum(len(words) for words in fields.values()) 365 | for fields in self._text_index.values() 366 | ) 367 | } 368 | 369 | return stats 370 | 371 | def clear_text_index(self, collection_name: str = None) -> None: 372 | """Clear text search index.""" 373 | if collection_name: 374 | if collection_name in self._text_index: 375 | del self._text_index[collection_name] 376 | logger.info(f"Text index cleared for collection '{collection_name}'") 377 | else: 378 | self._text_index.clear() 379 | logger.info("All text indexes cleared") 380 | -------------------------------------------------------------------------------- /benchmark_octanedb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | OctaneDB Performance Benchmarking Script 4 | 5 | This script provides comprehensive performance measurements for OctaneDB 6 | with different configurations and dataset sizes. 7 | 8 | Usage: 9 | python benchmark_octanedb.py [--vectors N] [--dimension D] [--runs R] 10 | 11 | """ 12 | 13 | import argparse 14 | import time 15 | import statistics 16 | import numpy as np 17 | import psutil 18 | import os 19 | from typing import Dict, List, Tuple 20 | import json 21 | 22 | try: 23 | from octanedb import OctaneDB 24 | except ImportError: 25 | print(" OctaneDB not installed. 
Install with: pip install octanedb") 26 | exit(1) 27 | 28 | 29 | class OctaneDBBenchmark: 30 | """Comprehensive benchmarking for OctaneDB.""" 31 | 32 | def __init__(self, dimension: int = 384, storage_mode: str = "in-memory"): 33 | self.dimension = dimension 34 | self.storage_mode = storage_mode 35 | self.db = None 36 | self.results = {} 37 | 38 | def setup_database(self): 39 | """Initialize the database and collection.""" 40 | print(f" Setting up OctaneDB (dimension={self.dimension}, storage={self.storage_mode})") 41 | 42 | self.db = OctaneDB( 43 | dimension=self.dimension, 44 | storage_mode=self.storage_mode 45 | ) 46 | 47 | # Create benchmark collection 48 | collection = self.db.create_collection("benchmark") 49 | self.db.use_collection("benchmark") 50 | 51 | print(" Database setup complete") 52 | 53 | def measure_memory_usage(self) -> Dict[str, float]: 54 | """Measure current memory usage.""" 55 | process = psutil.Process(os.getpid()) 56 | memory_info = process.memory_info() 57 | 58 | return { 59 | "rss_mb": memory_info.rss / 1024 / 1024, # Resident Set Size 60 | "vms_mb": memory_info.vms / 1024 / 1024, # Virtual Memory Size 61 | "percent": process.memory_percent() 62 | } 63 | 64 | def benchmark_insertion(self, num_vectors: int, batch_size: int = 1000) -> Dict[str, float]: 65 | """Benchmark vector insertion performance.""" 66 | print(f"Benchmarking insertion of {num_vectors:,} vectors...") 67 | 68 | # Generate random vectors 69 | vectors = np.random.randn(num_vectors, self.dimension).astype(np.float32) 70 | metadata = [{"id": i, "benchmark": True} for i in range(num_vectors)] 71 | 72 | # Measure insertion time 73 | start_time = time.time() 74 | start_memory = self.measure_memory_usage() 75 | 76 | # Insert in batches 77 | inserted_count = 0 78 | for i in range(0, num_vectors, batch_size): 79 | end_idx = min(i + batch_size, num_vectors) 80 | batch_vectors = vectors[i:end_idx] 81 | batch_metadata = metadata[i:end_idx] 82 | 83 | self.db.insert( 84 | vectors=batch_vectors.tolist(), 85 | metadata=batch_metadata, 86 | ids=list(range(i, end_idx)) 87 | ) 88 | 89 | inserted_count += len(batch_vectors) 90 | 91 | # Progress update 92 | if (i + batch_size) % (batch_size * 10) == 0 or end_idx == num_vectors: 93 | elapsed = time.time() - start_time 94 | rate = inserted_count / elapsed 95 | print(f" Inserted {inserted_count:,}/{num_vectors:,} vectors ({rate:.0f} vectors/sec)") 96 | 97 | end_time = time.time() 98 | end_memory = self.measure_memory_usage() 99 | 100 | total_time = end_time - start_time 101 | insertion_rate = num_vectors / total_time 102 | 103 | return { 104 | "total_time": total_time, 105 | "insertion_rate": insertion_rate, 106 | "batch_size": batch_size, 107 | "start_memory_mb": start_memory["rss_mb"], 108 | "end_memory_mb": end_memory["rss_mb"], 109 | "memory_increase_mb": end_memory["rss_mb"] - start_memory["rss_mb"] 110 | } 111 | 112 | def benchmark_index_build(self) -> Dict[str, float]: 113 | """Benchmark HNSW index construction.""" 114 | print("Benchmarking index construction...") 115 | 116 | start_time = time.time() 117 | start_memory = self.measure_memory_usage() 118 | 119 | # Build the index 120 | self.db._current_collection._build_index() 121 | 122 | end_time = time.time() 123 | end_memory = self.measure_memory_usage() 124 | 125 | build_time = end_time - start_time 126 | 127 | # Get index statistics 128 | stats = self.db._current_collection.get_stats() 129 | index_stats = stats.get("index_stats", {}) 130 | 131 | return { 132 | "build_time": build_time, 133 | "start_memory_mb": 
start_memory["rss_mb"], 134 | "end_memory_mb": end_memory["rss_mb"], 135 | "memory_increase_mb": end_memory["rss_mb"] - start_memory["rss_mb"], 136 | "index_stats": index_stats 137 | } 138 | 139 | def benchmark_search(self, num_queries: int = 100, k: int = 10) -> Dict[str, float]: 140 | """Benchmark search performance.""" 141 | print(f" Benchmarking search ({num_queries} queries, k={k})...") 142 | 143 | # Generate random query vectors 144 | query_vectors = np.random.randn(num_queries, self.dimension).astype(np.float32) 145 | 146 | search_times = [] 147 | results_counts = [] 148 | 149 | start_time = time.time() 150 | 151 | for i, query_vector in enumerate(query_vectors): 152 | query_start = time.time() 153 | 154 | results = self.db.search(query_vector=query_vector, k=k) 155 | 156 | query_time = time.time() - query_start 157 | search_times.append(query_time * 1000) # Convert to milliseconds 158 | results_counts.append(len(results)) 159 | 160 | # Progress update 161 | if (i + 1) % 20 == 0: 162 | print(f" Processed {i + 1}/{num_queries} queries...") 163 | 164 | total_time = time.time() - start_time 165 | 166 | return { 167 | "total_time": total_time, 168 | "queries_per_second": num_queries / total_time, 169 | "avg_search_time_ms": statistics.mean(search_times), 170 | "median_search_time_ms": statistics.median(search_times), 171 | "min_search_time_ms": min(search_times), 172 | "max_search_time_ms": max(search_times), 173 | "search_time_std_ms": statistics.stdev(search_times) if len(search_times) > 1 else 0, 174 | "avg_results_count": statistics.mean(results_counts), 175 | "search_times_ms": search_times 176 | } 177 | 178 | def benchmark_batch_search(self, num_queries: int = 100, k: int = 10) -> Dict[str, float]: 179 | """Benchmark batch search performance.""" 180 | print(f" Benchmarking batch search ({num_queries} queries, k={k})...") 181 | 182 | # Generate random query vectors 183 | query_vectors = np.random.randn(num_queries, self.dimension).astype(np.float32) 184 | 185 | start_time = time.time() 186 | 187 | # Perform batch search 188 | batch_results = self.db.search_batch(query_vectors=query_vectors, k=k) 189 | 190 | total_time = time.time() - start_time 191 | 192 | # Count total results 193 | total_results = sum(len(results) for results in batch_results) 194 | 195 | return { 196 | "total_time": total_time, 197 | "queries_per_second": num_queries / total_time, 198 | "total_results": total_results, 199 | "avg_results_per_query": total_results / num_queries 200 | } 201 | 202 | def run_comprehensive_benchmark(self, num_vectors: int, num_runs: int = 3) -> Dict: 203 | """Run comprehensive benchmark with multiple runs.""" 204 | print(f" Starting comprehensive benchmark ({num_runs} runs)") 205 | print(f" Dataset: {num_vectors:,} vectors, {self.dimension} dimensions") 206 | print(f" Storage: {self.storage_mode}") 207 | print("=" * 60) 208 | 209 | all_results = { 210 | "insertion": [], 211 | "index_build": [], 212 | "search": [], 213 | "batch_search": [], 214 | "system_info": self._get_system_info() 215 | } 216 | 217 | for run in range(num_runs): 218 | print(f"\n Run {run + 1}/{num_runs}") 219 | print("-" * 40) 220 | 221 | # Reset database for each run 222 | self.setup_database() 223 | 224 | # Benchmark insertion 225 | insertion_result = self.benchmark_insertion(num_vectors) 226 | all_results["insertion"].append(insertion_result) 227 | 228 | # Benchmark index build 229 | index_result = self.benchmark_index_build() 230 | all_results["index_build"].append(index_result) 231 | 232 | # Benchmark search 
233 | search_result = self.benchmark_search(100, 10) 234 | all_results["search"].append(search_result) 235 | 236 | # Benchmark batch search 237 | batch_result = self.benchmark_batch_search(100, 10) 238 | all_results["batch_search"].append(batch_result) 239 | 240 | print(f"Run {run + 1} completed") 241 | 242 | # Calculate aggregate results 243 | self.results = self._calculate_aggregate_results(all_results) 244 | 245 | return self.results 246 | 247 | def _get_system_info(self) -> Dict: 248 | """Get system information.""" 249 | return { 250 | "python_version": f"{os.sys.version_info.major}.{os.sys.version_info.minor}.{os.sys.version_info.micro}", 251 | "numpy_version": np.__version__, 252 | "cpu_count": os.cpu_count(), 253 | "memory_gb": psutil.virtual_memory().total / 1024 / 1024 / 1024, 254 | "platform": os.name 255 | } 256 | 257 | def _calculate_aggregate_results(self, all_results: Dict) -> Dict: 258 | """Calculate aggregate results from multiple runs.""" 259 | aggregated = {} 260 | 261 | for operation, runs in all_results.items(): 262 | if operation == "system_info": 263 | aggregated[operation] = runs 264 | continue 265 | 266 | aggregated[operation] = {} 267 | 268 | # Get all numeric values for each metric 269 | for metric in runs[0].keys(): 270 | if isinstance(runs[0][metric], (int, float)): 271 | values = [run[metric] for run in runs] 272 | aggregated[operation][f"{metric}_mean"] = statistics.mean(values) 273 | aggregated[operation][f"{metric}_median"] = statistics.median(values) 274 | aggregated[operation][f"{metric}_min"] = min(values) 275 | aggregated[operation][f"{metric}_max"] = max(values) 276 | if len(values) > 1: 277 | aggregated[operation][f"{metric}_std"] = statistics.stdev(values) 278 | else: 279 | aggregated[operation][f"{metric}_std"] = 0 280 | else: 281 | # For non-numeric values, just take the first run 282 | aggregated[operation][metric] = runs[0][metric] 283 | 284 | return aggregated 285 | 286 | def print_results(self): 287 | """Print formatted benchmark results.""" 288 | if not self.results: 289 | print(" No results to display. 
Run benchmark first.") 290 | return 291 | 292 | print("\n" + "=" * 60) 293 | print(" OCTANEDB BENCHMARK RESULTS") 294 | print("=" * 60) 295 | 296 | # System Information 297 | print(f"\n System Information:") 298 | sys_info = self.results["system_info"] 299 | print(f" Python: {sys_info['python_version']}") 300 | print(f" NumPy: {sys_info['numpy_version']}") 301 | print(f" CPU Cores: {sys_info['cpu_count']}") 302 | print(f" Memory: {sys_info['memory_gb']:.1f} GB") 303 | print(f" Platform: {sys_info['platform']}") 304 | 305 | # Insertion Results 306 | print(f"\nInsertion Performance:") 307 | insertion = self.results["insertion"] 308 | print(f" Rate: {insertion['insertion_rate_mean']:.0f} ± {insertion['insertion_rate_std']:.0f} vectors/sec") 309 | print(f" Total Time: {insertion['total_time_mean']:.2f} ± {insertion['total_time_std']:.2f} seconds") 310 | print(f" Memory Increase: {insertion['memory_increase_mb_mean']:.1f} ± {insertion['memory_increase_mb_std']:.1f} MB") 311 | 312 | # Index Build Results 313 | print(f"\n Index Build Performance:") 314 | index_build = self.results["index_build"] 315 | print(f" Build Time: {index_build['build_time_mean']:.2f} ± {index_build['build_time_std']:.2f} seconds") 316 | print(f" Memory Increase: {index_build['memory_increase_mb_mean']:.1f} ± {index_build['memory_increase_mb_std']:.1f} MB") 317 | 318 | # Search Results 319 | print(f"\n Search Performance:") 320 | search = self.results["search"] 321 | print(f" Single Query: {search['avg_search_time_ms_mean']:.2f} ± {search['search_time_std_ms_mean']:.2f} ms") 322 | print(f" Queries/sec: {search['queries_per_second_mean']:.1f} ± {search['queries_per_second_std']:.1f}") 323 | print(f" Min Time: {search['min_search_time_ms_mean']:.2f} ms") 324 | print(f" Max Time: {search['max_search_time_ms_mean']:.2f} ms") 325 | 326 | # Batch Search Results 327 | print(f"\n Batch Search Performance:") 328 | batch_search = self.results["batch_search"] 329 | print(f" Batch Queries/sec: {batch_search['queries_per_second_mean']:.1f} ± {batch_search['queries_per_second_std']:.1f}") 330 | print(f" Total Time: {batch_search['total_time_mean']:.2f} ± {batch_search['total_time_std']:.2f} seconds") 331 | 332 | print("\n" + "=" * 60) 333 | 334 | def save_results(self, filename: str = "octanedb_benchmark_results.json"): 335 | """Save results to JSON file.""" 336 | if not self.results: 337 | print(" No results to save. 
Run benchmark first.") 338 | return 339 | 340 | with open(filename, 'w') as f: 341 | json.dump(self.results, f, indent=2, default=str) 342 | 343 | print(f" Results saved to {filename}") 344 | 345 | 346 | def main(): 347 | """Main benchmark execution.""" 348 | parser = argparse.ArgumentParser( 349 | description="OctaneDB Performance Benchmarking", 350 | formatter_class=argparse.RawDescriptionHelpFormatter, 351 | epilog=""" 352 | Examples: 353 | python benchmark_octanedb.py --vectors 10000 --dimension 128 354 | python benchmark_octanedb.py --vectors 100000 --dimension 384 --runs 5 355 | python benchmark_octanedb.py --vectors 50000 --dimension 512 --storage persistent 356 | """ 357 | ) 358 | 359 | parser.add_argument( 360 | "--vectors", "-v", 361 | type=int, 362 | default=10000, 363 | help="Number of vectors to benchmark (default: 10000)" 364 | ) 365 | 366 | parser.add_argument( 367 | "--dimension", "-d", 368 | type=int, 369 | default=384, 370 | help="Vector dimension (default: 384)" 371 | ) 372 | 373 | parser.add_argument( 374 | "--runs", "-r", 375 | type=int, 376 | default=3, 377 | help="Number of benchmark runs (default: 3)" 378 | ) 379 | 380 | parser.add_argument( 381 | "--storage", "-s", 382 | choices=["in-memory", "persistent", "hybrid"], 383 | default="in-memory", 384 | help="Storage mode (default: in-memory)" 385 | ) 386 | 387 | parser.add_argument( 388 | "--save", "-o", 389 | action="store_true", 390 | help="Save results to JSON file" 391 | ) 392 | 393 | args = parser.parse_args() 394 | 395 | print(" OctaneDB Performance Benchmarking") 396 | print("=" * 50) 397 | 398 | # Run benchmark 399 | benchmark = OctaneDBBenchmark( 400 | dimension=args.dimension, 401 | storage_mode=args.storage 402 | ) 403 | 404 | try: 405 | results = benchmark.run_comprehensive_benchmark( 406 | num_vectors=args.vectors, 407 | num_runs=args.runs 408 | ) 409 | 410 | # Display results 411 | benchmark.print_results() 412 | 413 | # Save results if requested 414 | if args.save: 415 | benchmark.save_results() 416 | 417 | except KeyboardInterrupt: 418 | print("\n Benchmark interrupted by user") 419 | return 1 420 | except Exception as e: 421 | print(f"\n Benchmark failed: {e}") 422 | import traceback 423 | traceback.print_exc() 424 | return 1 425 | 426 | print("\n Benchmark completed successfully!") 427 | return 0 428 | 429 | 430 | if __name__ == "__main__": 431 | exit(main()) 432 | -------------------------------------------------------------------------------- /octanedb/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility classes for vector operations, distance metrics, and helper functions. 3 | """ 4 | 5 | import numpy as np 6 | import logging 7 | from typing import List, Tuple, Optional, Dict, Any, Union 8 | from enum import Enum 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class DistanceMetrics(Enum): 14 | """Supported distance metrics.""" 15 | COSINE = "cosine" 16 | EUCLIDEAN = "euclidean" 17 | DOT = "dot" 18 | MANHATTAN = "manhattan" 19 | CHEBYSHEV = "chebyshev" 20 | JACCARD = "jaccard" 21 | 22 | 23 | class VectorUtils: 24 | """ 25 | Utility class for vector operations and distance computations. 26 | 27 | Provides: 28 | - Fast distance calculations 29 | - Vector normalization 30 | - Batch operations 31 | - Performance optimizations 32 | """ 33 | 34 | def __init__(self, distance_metric: str = "cosine"): 35 | """ 36 | Initialize vector utilities. 
37 | 38 | Args: 39 | distance_metric: Default distance metric to use 40 | """ 41 | self.distance_metric = distance_metric 42 | self._distance_func = self._get_distance_function() 43 | 44 | logger.info(f"Vector utilities initialized with metric: {distance_metric}") 45 | 46 | def _get_distance_function(self): 47 | """Get the appropriate distance function.""" 48 | if self.distance_metric == "cosine": 49 | return self.cosine_distance 50 | elif self.distance_metric == "euclidean": 51 | return self.euclidean_distance 52 | elif self.distance_metric == "dot": 53 | return self.dot_distance 54 | elif self.distance_metric == "manhattan": 55 | return self.manhattan_distance 56 | elif self.distance_metric == "chebyshev": 57 | return self.chebyshev_distance 58 | elif self.distance_metric == "jaccard": 59 | return self.jaccard_distance 60 | else: 61 | raise ValueError(f"Unsupported distance metric: {self.distance_metric}") 62 | 63 | @staticmethod 64 | def cosine_distance(a: np.ndarray, b: np.ndarray) -> float: 65 | """ 66 | Compute cosine distance between two vectors. 67 | 68 | Args: 69 | a: First vector 70 | b: Second vector 71 | 72 | Returns: 73 | Cosine distance (0 = identical, 2 = opposite) 74 | """ 75 | dot_product = np.dot(a, b) 76 | norm_a = np.linalg.norm(a) 77 | norm_b = np.linalg.norm(b) 78 | 79 | if norm_a == 0 or norm_b == 0: 80 | return 1.0 81 | 82 | cosine_similarity = dot_product / (norm_a * norm_b) 83 | return 1.0 - cosine_similarity 84 | 85 | @staticmethod 86 | def euclidean_distance(a: np.ndarray, b: np.ndarray) -> float: 87 | """ 88 | Compute Euclidean distance between two vectors. 89 | 90 | Args: 91 | a: First vector 92 | b: Second vector 93 | 94 | Returns: 95 | Euclidean distance 96 | """ 97 | return np.linalg.norm(a - b) 98 | 99 | @staticmethod 100 | def dot_distance(a: np.ndarray, b: np.ndarray) -> float: 101 | """ 102 | Compute negative dot product distance. 103 | 104 | Args: 105 | a: First vector 106 | b: Second vector 107 | 108 | Returns: 109 | Negative dot product 110 | """ 111 | return -np.dot(a, b) 112 | 113 | @staticmethod 114 | def manhattan_distance(a: np.ndarray, b: np.ndarray) -> float: 115 | """ 116 | Compute Manhattan (L1) distance between two vectors. 117 | 118 | Args: 119 | a: First vector 120 | b: Second vector 121 | 122 | Returns: 123 | Manhattan distance 124 | """ 125 | return np.sum(np.abs(a - b)) 126 | 127 | @staticmethod 128 | def chebyshev_distance(a: np.ndarray, b: np.ndarray) -> float: 129 | """ 130 | Compute Chebyshev (L∞) distance between two vectors. 131 | 132 | Args: 133 | a: First vector 134 | b: Second vector 135 | 136 | Returns: 137 | Chebyshev distance 138 | """ 139 | return np.max(np.abs(a - b)) 140 | 141 | @staticmethod 142 | def jaccard_distance(a: np.ndarray, b: np.ndarray) -> float: 143 | """ 144 | Compute Jaccard distance between two vectors. 145 | 146 | Args: 147 | a: First vector 148 | b: Second vector 149 | 150 | Returns: 151 | Jaccard distance 152 | """ 153 | intersection = np.sum(np.minimum(a, b)) 154 | union = np.sum(np.maximum(a, b)) 155 | 156 | if union == 0: 157 | return 1.0 158 | 159 | return 1.0 - (intersection / union) 160 | 161 | def compute_distance(self, a: np.ndarray, b: np.ndarray) -> float: 162 | """ 163 | Compute distance between two vectors using the configured metric. 
164 | 165 | Args: 166 | a: First vector 167 | b: Second vector 168 | 169 | Returns: 170 | Distance value 171 | """ 172 | return self._distance_func(a, b) 173 | 174 | def compute_distances_batch( 175 | self, 176 | query: np.ndarray, 177 | vectors: np.ndarray 178 | ) -> np.ndarray: 179 | """ 180 | Compute distances between a query vector and multiple vectors. 181 | 182 | Args: 183 | query: Query vector 184 | vectors: Array of vectors to compare against 185 | 186 | Returns: 187 | Array of distances 188 | """ 189 | if self.distance_metric == "cosine": 190 | # Optimized cosine distance for batch computation 191 | query_norm = np.linalg.norm(query) 192 | if query_norm == 0: 193 | return np.ones(len(vectors)) 194 | 195 | # Normalize query 196 | query_normalized = query / query_norm 197 | 198 | # Compute norms for all vectors 199 | vector_norms = np.linalg.norm(vectors, axis=1) 200 | 201 | # Avoid division by zero 202 | valid_norms = vector_norms > 0 203 | distances = np.ones(len(vectors)) 204 | 205 | if np.any(valid_norms): 206 | # Compute dot products 207 | dot_products = np.dot(vectors[valid_norms], query_normalized) 208 | 209 | # Compute cosine similarities 210 | cosine_similarities = dot_products / vector_norms[valid_norms] 211 | 212 | # Convert to distances 213 | distances[valid_norms] = 1.0 - cosine_similarities 214 | 215 | return distances 216 | 217 | elif self.distance_metric == "euclidean": 218 | # Optimized Euclidean distance for batch computation 219 | return np.linalg.norm(vectors - query, axis=1) 220 | 221 | else: 222 | # Fallback to individual computation 223 | distances = np.zeros(len(vectors)) 224 | for i, vector in enumerate(vectors): 225 | distances[i] = self._distance_func(query, vector) 226 | return distances 227 | 228 | @staticmethod 229 | def normalize_vector(vector: np.ndarray) -> np.ndarray: 230 | """ 231 | Normalize a vector to unit length. 232 | 233 | Args: 234 | vector: Vector to normalize 235 | 236 | Returns: 237 | Normalized vector 238 | """ 239 | norm = np.linalg.norm(vector) 240 | if norm == 0: 241 | return vector 242 | return vector / norm 243 | 244 | @staticmethod 245 | def normalize_vectors(vectors: np.ndarray) -> np.ndarray: 246 | """ 247 | Normalize multiple vectors to unit length. 248 | 249 | Args: 250 | vectors: Array of vectors to normalize 251 | 252 | Returns: 253 | Array of normalized vectors 254 | """ 255 | norms = np.linalg.norm(vectors, axis=1, keepdims=True) 256 | # Avoid division by zero 257 | norms[norms == 0] = 1.0 258 | return vectors / norms 259 | 260 | @staticmethod 261 | def compute_centroid(vectors: np.ndarray) -> np.ndarray: 262 | """ 263 | Compute the centroid (mean) of multiple vectors. 264 | 265 | Args: 266 | vectors: Array of vectors 267 | 268 | Returns: 269 | Centroid vector 270 | """ 271 | if len(vectors) == 0: 272 | raise ValueError("Cannot compute centroid of empty vector set") 273 | 274 | return np.mean(vectors, axis=0) 275 | 276 | @staticmethod 277 | def compute_variance(vectors: np.ndarray) -> np.ndarray: 278 | """ 279 | Compute the variance of multiple vectors. 
280 | 281 | Args: 282 | vectors: Array of vectors 283 | 284 | Returns: 285 | Variance vector 286 | """ 287 | if len(vectors) < 2: 288 | raise ValueError("Need at least 2 vectors to compute variance") 289 | 290 | return np.var(vectors, axis=0) 291 | 292 | @staticmethod 293 | def find_nearest_neighbors( 294 | query: np.ndarray, 295 | vectors: np.ndarray, 296 | k: int, 297 | distance_func: callable = None 298 | ) -> Tuple[np.ndarray, np.ndarray]: 299 | """ 300 | Find k nearest neighbors using brute force search. 301 | 302 | Args: 303 | query: Query vector 304 | vectors: Array of vectors to search in 305 | k: Number of neighbors to find 306 | distance_func: Distance function to use 307 | 308 | Returns: 309 | Tuple of (indices, distances) 310 | """ 311 | if distance_func is None: 312 | distance_func = VectorUtils.cosine_distance 313 | 314 | # Compute distances 315 | distances = np.array([ 316 | distance_func(query, vector) for vector in vectors 317 | ]) 318 | 319 | # Find k nearest neighbors 320 | if k >= len(vectors): 321 | indices = np.arange(len(vectors)) 322 | else: 323 | indices = np.argpartition(distances, k)[:k] 324 | 325 | # Sort by distance 326 | sorted_indices = indices[np.argsort(distances[indices])] 327 | sorted_distances = distances[sorted_indices] 328 | 329 | return sorted_indices, sorted_distances 330 | 331 | @staticmethod 332 | def compute_similarity_matrix(vectors: np.ndarray, metric: str = "cosine") -> np.ndarray: 333 | """ 334 | Compute similarity matrix between all pairs of vectors. 335 | 336 | Args: 337 | vectors: Array of vectors 338 | metric: Similarity metric to use 339 | 340 | Returns: 341 | Similarity matrix 342 | """ 343 | n = len(vectors) 344 | similarity_matrix = np.zeros((n, n)) 345 | 346 | if metric == "cosine": 347 | # Normalize vectors 348 | normalized_vectors = VectorUtils.normalize_vectors(vectors) 349 | 350 | # Compute cosine similarities 351 | similarity_matrix = np.dot(normalized_vectors, normalized_vectors.T) 352 | 353 | # Ensure diagonal is 1.0 (self-similarity) 354 | np.fill_diagonal(similarity_matrix, 1.0) 355 | 356 | elif metric == "dot": 357 | # Compute dot products 358 | similarity_matrix = np.dot(vectors, vectors.T) 359 | 360 | else: 361 | # Compute distances and convert to similarities 362 | for i in range(n): 363 | for j in range(n): 364 | if i == j: 365 | similarity_matrix[i, j] = 1.0 366 | else: 367 | distance = VectorUtils.compute_distance(vectors[i], vectors[j]) 368 | # Convert distance to similarity (inverse relationship) 369 | similarity_matrix[i, j] = 1.0 / (1.0 + distance) 370 | 371 | return similarity_matrix 372 | 373 | @staticmethod 374 | def compute_diversity_score(vectors: np.ndarray, metric: str = "cosine") -> float: 375 | """ 376 | Compute diversity score for a set of vectors. 
377 | 378 | Args: 379 | vectors: Array of vectors 380 | metric: Distance metric to use 381 | 382 | Returns: 383 | Diversity score (higher = more diverse) 384 | """ 385 | if len(vectors) < 2: 386 | return 0.0 387 | 388 | # Compute pairwise distances 389 | total_distance = 0.0 390 | count = 0 391 | 392 | for i in range(len(vectors)): 393 | for j in range(i + 1, len(vectors)): 394 | distance = VectorUtils.compute_distance(vectors[i], vectors[j]) 395 | total_distance += distance 396 | count += 1 397 | 398 | # Return average distance 399 | return total_distance / count if count > 0 else 0.0 400 | 401 | @staticmethod 402 | def compute_quality_score( 403 | query: np.ndarray, 404 | results: np.ndarray, 405 | ground_truth: np.ndarray = None 406 | ) -> Dict[str, float]: 407 | """ 408 | Compute quality metrics for search results. 409 | 410 | Args: 411 | query: Query vector 412 | results: Retrieved result vectors 413 | ground_truth: Ground truth relevant vectors (optional) 414 | 415 | Returns: 416 | Dictionary of quality metrics 417 | """ 418 | metrics = {} 419 | 420 | # Compute average distance to query 421 | distances = np.array([ 422 | VectorUtils.cosine_distance(query, result) for result in results 423 | ]) 424 | 425 | metrics["avg_distance"] = float(np.mean(distances)) 426 | metrics["min_distance"] = float(np.min(distances)) 427 | metrics["max_distance"] = float(np.max(distances)) 428 | metrics["std_distance"] = float(np.std(distances)) 429 | 430 | # Compute diversity of results 431 | if len(results) > 1: 432 | metrics["diversity"] = VectorUtils.compute_diversity_score(results) 433 | 434 | # Compute precision/recall if ground truth is provided 435 | if ground_truth is not None: 436 | # This is a simplified implementation 437 | # In practice, you'd need relevance scores or binary relevance 438 | metrics["num_relevant"] = len(ground_truth) 439 | metrics["num_retrieved"] = len(results) 440 | 441 | return metrics 442 | 443 | def get_stats(self) -> Dict[str, Any]: 444 | """Get vector utilities statistics.""" 445 | return { 446 | "distance_metric": self.distance_metric, 447 | "supported_metrics": [metric.value for metric in DistanceMetrics] 448 | } 449 | 450 | 451 | class PerformanceOptimizer: 452 | """ 453 | Performance optimization utilities for vector operations. 454 | """ 455 | 456 | @staticmethod 457 | def optimize_batch_size( 458 | vector_dimension: int, 459 | available_memory: int = None, 460 | target_latency: float = 0.1 461 | ) -> int: 462 | """ 463 | Optimize batch size for vector operations. 
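        Example (illustrative):
            PerformanceOptimizer.optimize_batch_size(vector_dimension=384)
            # -> 5000 with the default heuristics (no memory hint given, 384 > 100 dims)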
464 | 465 | Args: 466 | vector_dimension: Dimension of vectors 467 | available_memory: Available memory in bytes 468 | target_latency: Target latency in seconds 469 | 470 | Returns: 471 | Optimal batch size 472 | """ 473 | # Estimate memory per vector (float32) 474 | bytes_per_vector = vector_dimension * 4 475 | 476 | if available_memory: 477 | # Use 80% of available memory 478 | usable_memory = int(available_memory * 0.8) 479 | max_batch_size = usable_memory // bytes_per_vector 480 | else: 481 | # Default to reasonable batch size 482 | max_batch_size = 10000 483 | 484 | # Adjust based on dimension 485 | if vector_dimension > 1000: 486 | max_batch_size = min(max_batch_size, 1000) 487 | elif vector_dimension > 100: 488 | max_batch_size = min(max_batch_size, 5000) 489 | 490 | return max(1, min(max_batch_size, 100000)) 491 | 492 | @staticmethod 493 | def estimate_memory_usage( 494 | num_vectors: int, 495 | vector_dimension: int, 496 | include_index: bool = True 497 | ) -> Dict[str, int]: 498 | """ 499 | Estimate memory usage for vector storage. 500 | 501 | Args: 502 | num_vectors: Number of vectors 503 | vector_dimension: Dimension of vectors 504 | include_index: Whether to include index memory 505 | 506 | Returns: 507 | Dictionary of memory usage estimates 508 | """ 509 | # Vector storage (float32) 510 | vector_memory = num_vectors * vector_dimension * 4 511 | 512 | # Metadata storage (estimated) 513 | metadata_memory = num_vectors * 100 # Rough estimate 514 | 515 | # Index memory 516 | index_memory = 0 517 | if include_index: 518 | # HNSW index memory (rough estimate) 519 | index_memory = num_vectors * vector_dimension * 2 520 | 521 | total_memory = vector_memory + metadata_memory + index_memory 522 | 523 | return { 524 | "vectors": vector_memory, 525 | "metadata": metadata_memory, 526 | "index": index_memory, 527 | "total": total_memory 528 | } 529 | 530 | @staticmethod 531 | def get_optimal_threads(num_vectors: int, vector_dimension: int) -> int: 532 | """ 533 | Get optimal number of threads for parallel processing. 534 | 535 | Args: 536 | num_vectors: Number of vectors 537 | vector_dimension: Dimension of vectors 538 | 539 | Returns: 540 | Optimal number of threads 541 | """ 542 | import os 543 | 544 | # Get CPU count 545 | cpu_count = os.cpu_count() or 1 546 | 547 | # Adjust based on workload 548 | if num_vectors < 1000 or vector_dimension < 100: 549 | return min(cpu_count, 2) 550 | elif num_vectors < 10000: 551 | return min(cpu_count, 4) 552 | else: 553 | return min(cpu_count, 8) 554 | -------------------------------------------------------------------------------- /octanedb/storage.py: -------------------------------------------------------------------------------- 1 | """ 2 | Storage manager for efficient persistence and loading of vector databases. 3 | """ 4 | 5 | import numpy as np 6 | import h5py 7 | import msgpack 8 | import logging 9 | import time 10 | from typing import Dict, Any, Optional, List, Union 11 | from pathlib import Path 12 | from collections import defaultdict 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class StorageManager: 18 | """ 19 | Storage manager for efficient persistence and loading of vector databases. 
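    Example (illustrative; `collections` is assumed to be a dict mapping names to Collection objects):
        manager = StorageManager(storage_path=Path("./data"))
        manager.save_database(collections, Path("./data/my_database.h5"))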
20 | 21 | Features: 22 | - HDF5-based storage with compression 23 | - Efficient metadata serialization 24 | - Incremental saves and loads 25 | - Memory-mapped access for large datasets 26 | """ 27 | 28 | def __init__( 29 | self, 30 | storage_path: Optional[Path] = None, 31 | enable_cache: bool = True, 32 | cache_size: int = 10000, 33 | compression: str = "gzip", 34 | compression_opts: int = 6 35 | ): 36 | """ 37 | Initialize storage manager. 38 | 39 | Args: 40 | storage_path: Base path for storage 41 | enable_cache: Enable caching for better performance 42 | cache_size: Maximum cache size 43 | compression: Compression algorithm 44 | compression_opts: Compression options 45 | """ 46 | self.storage_path = storage_path 47 | self.enable_cache = enable_cache 48 | self.cache_size = cache_size 49 | self.compression = compression 50 | self.compression_opts = compression_opts 51 | 52 | # Cache for frequently accessed data 53 | self._cache: Dict[str, Any] = {} 54 | self._cache_hits = 0 55 | self._cache_misses = 0 56 | 57 | # Create storage directory if it doesn't exist 58 | if self.storage_path: 59 | self.storage_path.mkdir(parents=True, exist_ok=True) 60 | 61 | logger.info(f"Storage manager initialized at {self.storage_path}") 62 | 63 | def save_database(self, collections: Dict[str, Any], save_path: Path) -> None: 64 | """ 65 | Save the entire database to disk. 66 | 67 | Args: 68 | collections: Dictionary of collections to save 69 | save_path: Path to save the database 70 | """ 71 | start_time = time.time() 72 | 73 | # Create HDF5 file 74 | with h5py.File(save_path, 'w') as f: 75 | # Save database metadata 76 | self._save_database_metadata(f, collections) 77 | 78 | # Save each collection 79 | for collection_name, collection in collections.items(): 80 | self._save_collection(f, collection_name, collection) 81 | 82 | save_time = time.time() - start_time 83 | logger.info(f"Database saved to {save_path} in {save_time:.4f}s") 84 | 85 | def _save_database_metadata(self, f: h5py.File, collections: Dict[str, Any]) -> None: 86 | """Save database-level metadata.""" 87 | # Database info 88 | db_info = { 89 | "version": "0.1.0", 90 | "created_at": time.time(), 91 | "num_collections": len(collections), 92 | "total_vectors": sum(c.count() for c in collections.values()) 93 | } 94 | 95 | # Collection metadata 96 | collection_metadata = {} 97 | for name, collection in collections.items(): 98 | collection_metadata[name] = { 99 | "dimension": collection.dimension, 100 | "index_type": collection.index_type, 101 | "m": collection.m, 102 | "ef_construction": collection.ef_construction, 103 | "ef_search": collection.ef_search, 104 | "max_elements": collection.max_elements, 105 | "distance_metric": collection.distance_metric, 106 | "vector_count": collection.count(), 107 | "metadata_count": len(collection._metadata) 108 | } 109 | 110 | # Save as datasets to avoid HDF5 attribute limitations 111 | f.create_dataset('db_info', data=np.frombuffer(msgpack.packb(db_info), dtype=np.uint8)) 112 | f.create_dataset('collection_metadata', data=np.frombuffer(msgpack.packb(collection_metadata), dtype=np.uint8)) 113 | 114 | def _save_collection(self, f: h5py.File, collection_name: str, collection: Any) -> None: 115 | """Save a single collection.""" 116 | # Create collection group 117 | collection_group = f.create_group(collection_name) 118 | 119 | # Save collection metadata 120 | collection_group.attrs['name'] = collection_name 121 | collection_group.attrs['dimension'] = collection.dimension 122 | collection_group.attrs['index_type'] = 
collection.index_type 123 | collection_group.attrs['distance_metric'] = collection.distance_metric 124 | 125 | # Save vectors 126 | if collection._vectors: 127 | vectors_array = np.array([v for v in collection._vectors.values()], dtype=np.float32) 128 | vector_ids = list(collection._vectors.keys()) 129 | 130 | # Save vectors with compression 131 | collection_group.create_dataset( 132 | 'vectors', 133 | data=vectors_array, 134 | compression=self.compression, 135 | compression_opts=self.compression_opts 136 | ) 137 | 138 | # Save vector IDs 139 | collection_group.create_dataset( 140 | 'vector_ids', 141 | data=vector_ids, 142 | compression=self.compression, 143 | compression_opts=self.compression_opts 144 | ) 145 | 146 | # Save metadata 147 | if collection._metadata: 148 | # Convert integer keys to strings for msgpack compatibility 149 | metadata_for_storage = {str(k): v for k, v in collection._metadata.items()} 150 | # Convert metadata to bytes 151 | metadata_bytes = msgpack.packb(metadata_for_storage) 152 | collection_group.create_dataset( 153 | 'metadata', 154 | data=np.frombuffer(metadata_bytes, dtype=np.uint8), 155 | compression=self.compression, 156 | compression_opts=self.compression_opts 157 | ) 158 | 159 | # Save index if built 160 | if collection._index and collection._index_built: 161 | self._save_index(collection_group, collection._index) 162 | 163 | # Save collection stats 164 | stats = collection.get_stats() 165 | collection_group.create_dataset('stats', data=np.frombuffer(msgpack.packb(stats), dtype=np.uint8)) 166 | 167 | def _save_index(self, collection_group: h5py.Group, index) -> None: 168 | """Save index data.""" 169 | if hasattr(index, 'get_stats'): 170 | index_stats = index.get_stats() 171 | collection_group.create_dataset('index_stats', data=np.frombuffer(msgpack.packb(index_stats), dtype=np.uint8)) 172 | 173 | def load_database(self, load_path: Path) -> Dict[str, Any]: 174 | """ 175 | Load the entire database from disk. 
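        Example (a minimal sketch of the expected call pattern; the directory and
        file name here are illustrative, not defaults used by the library):

            manager = StorageManager(storage_path=Path("./octane_data"))
            collections = manager.load_database(Path("./octane_data/database.h5"))
            print(list(collections.keys()))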
176 | 177 | Args: 178 | load_path: Path to load the database from 179 | 180 | Returns: 181 | Dictionary of loaded collections 182 | """ 183 | start_time = time.time() 184 | 185 | collections = {} 186 | 187 | with h5py.File(load_path, 'r') as f: 188 | # Load database metadata from datasets 189 | db_info = msgpack.unpackb(bytes(f['db_info'][:])) 190 | collection_metadata = msgpack.unpackb(bytes(f['collection_metadata'][:])) 191 | 192 | # Load each collection 193 | for collection_name in f.keys(): 194 | if collection_name not in ['db_info', 'collection_metadata']: 195 | collection = self._load_collection(f[collection_name], collection_name) 196 | collections[collection_name] = collection 197 | 198 | load_time = time.time() - start_time 199 | logger.info(f"Database loaded from {load_path} in {load_time:.4f}s") 200 | 201 | return collections 202 | 203 | def load_database_metadata(self, load_path: Path) -> Dict[str, Any]: 204 | """Load only database metadata without loading vectors.""" 205 | with h5py.File(load_path, 'r') as f: 206 | db_info = msgpack.unpackb(bytes(f['db_info'][:])) 207 | collection_metadata = msgpack.unpackb(bytes(f['collection_metadata'][:])) 208 | 209 | # Combine metadata 210 | metadata = db_info.copy() 211 | metadata['collections'] = collection_metadata 212 | 213 | return metadata 214 | 215 | def _load_collection(self, collection_group: h5py.Group, collection_name: str) -> Any: 216 | """Load a single collection.""" 217 | # Get collection metadata from attributes (primary source) 218 | dimension = collection_group.attrs.get('dimension') 219 | index_type = collection_group.attrs.get('index_type', 'hnsw') 220 | distance_metric = collection_group.attrs.get('distance_metric', 'cosine') 221 | 222 | # Validate that we have the essential dimension 223 | if dimension is None: 224 | raise ValueError(f"Collection '{collection_name}' missing dimension information") 225 | 226 | # Create collection using string import to avoid circular dependency 227 | from .collection import Collection 228 | collection = Collection( 229 | name=collection_name, 230 | dimension=dimension, 231 | index_type=index_type, 232 | distance_metric=distance_metric 233 | ) 234 | 235 | # Load vectors 236 | if 'vectors' in collection_group and 'vector_ids' in collection_group: 237 | vectors = collection_group['vectors'][:] 238 | vector_ids = collection_group['vector_ids'][:].tolist() 239 | 240 | # Restore vectors 241 | for i, vector_id in enumerate(vector_ids): 242 | collection._vectors[vector_id] = vectors[i] 243 | collection._next_id = max(collection._next_id, vector_id + 1) 244 | 245 | # Load metadata 246 | if 'metadata' in collection_group: 247 | metadata_bytes = collection_group['metadata'][:].tobytes() 248 | loaded_metadata = msgpack.unpackb(metadata_bytes) 249 | # Convert string keys back to integers 250 | collection._metadata = {int(k): v for k, v in loaded_metadata.items()} 251 | 252 | # Load stats (already loaded above for dimension, etc.) 253 | if 'stats' in collection_group: 254 | stats_bytes = collection_group['stats'][:].tobytes() 255 | stats = msgpack.unpackb(stats_bytes) 256 | collection._stats.update(stats) 257 | 258 | # Mark index as needing rebuild 259 | collection._index_built = False 260 | collection._index_needs_rebuild = True 261 | 262 | logger.debug(f"Loaded collection '{collection_name}' with {len(collection._vectors)} vectors") 263 | 264 | return collection 265 | 266 | def save_collection(self, collection: Any, collection_name: str) -> None: 267 | """ 268 | Save a single collection. 
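        Example (a minimal sketch; "documents" is an illustrative collection name
        and my_collection stands for an existing Collection instance):

            manager = StorageManager(storage_path=Path("./octane_data"))
            manager.save_collection(my_collection, "documents")
            restored = manager.load_collection("documents")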
269 | 
270 |         Args:
271 |             collection: Collection object to save
272 |             collection_name: Name of the collection
273 |         """
274 |         if not self.storage_path:
275 |             raise ValueError("Storage path not set")
276 | 
277 |         collection_path = self.storage_path / f"{collection_name}.h5"
278 | 
279 |         with h5py.File(collection_path, 'w') as f:
280 |             self._save_collection(f, collection_name, collection)
281 | 
282 |         logger.info(f"Collection '{collection_name}' saved to {collection_path}")
283 | 
284 |     def load_collection(self, collection_name: str) -> Any:
285 |         """
286 |         Load a single collection.
287 | 
288 |         Args:
289 |             collection_name: Name of the collection to load
290 | 
291 |         Returns:
292 |             Loaded collection
293 |         """
294 |         if not self.storage_path:
295 |             raise ValueError("Storage path not set")
296 | 
297 |         collection_path = self.storage_path / f"{collection_name}.h5"
298 | 
299 |         if not collection_path.exists():
300 |             raise FileNotFoundError(f"Collection file not found: {collection_path}")
301 | 
302 |         with h5py.File(collection_path, 'r') as f:
303 |             collection = self._load_collection(f[collection_name], collection_name)
304 | 
305 |         logger.info(f"Collection '{collection_name}' loaded from {collection_path}")
306 |         return collection
307 | 
308 |     def delete_collection(self, collection_name: str) -> None:
309 |         """
310 |         Delete a collection from storage.
311 | 
312 |         Args:
313 |             collection_name: Name of the collection to delete
314 |         """
315 |         if not self.storage_path:
316 |             return
317 | 
318 |         collection_path = self.storage_path / f"{collection_name}.h5"
319 | 
320 |         if collection_path.exists():
321 |             collection_path.unlink()
322 |             logger.info(f"Collection '{collection_name}' deleted from storage")
323 | 
324 |     def list_saved_collections(self) -> List[str]:
325 |         """List all saved collections."""
326 |         if not self.storage_path:
327 |             return []
328 | 
329 |         collections = []
330 |         for file_path in self.storage_path.glob("*.h5"):
331 |             collections.append(file_path.stem)
332 | 
333 |         return collections
334 | 
335 |     def get_collection_info(self, collection_name: str) -> Optional[Dict[str, Any]]:
336 |         """
337 |         Get information about a saved collection without loading it.
338 | 
339 |         Args:
340 |             collection_name: Name of the collection
341 | 
342 |         Returns:
343 |             Collection information or None if not found
344 |         """
345 |         if not self.storage_path:
346 |             return None
347 | 
348 |         collection_path = self.storage_path / f"{collection_name}.h5"
349 | 
350 |         if not collection_path.exists():
351 |             return None
352 | 
353 |         try:
354 |             with h5py.File(collection_path, 'r') as f:
355 |                 # Collection data is stored in a group named after the collection
356 |                 group = f[collection_name] if collection_name in f else f
357 |                 info = {
358 |                     'name': collection_name,
359 |                     'dimension': group.attrs.get('dimension'),
360 |                     'index_type': group.attrs.get('index_type'),
361 |                     'distance_metric': group.attrs.get('distance_metric'),
362 |                     'vector_count': group['vectors'].shape[0] if 'vectors' in group else 0,
363 |                     'metadata_count': len(msgpack.unpackb(bytes(group['metadata'][:]))) if 'metadata' in group else 0
364 |                 }
365 |                 if 'stats' in group:
366 |                     stats = msgpack.unpackb(bytes(group['stats'][:]))
367 |                     info.update(stats)
368 |                 return info
369 |         except Exception as e:
370 |             logger.error(f"Error reading collection info for '{collection_name}': {e}")
371 |             return None
372 | 
373 |     def optimize_storage(self, collection_name: str) -> None:
374 |         """
375 |         Optimize storage for a collection (e.g., recompress with better settings).
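        Example (a minimal sketch continuing the storage-manager usage above;
        the collection name is illustrative):

            manager.optimize_storage("documents")
            print(manager.get_storage_stats()["total_storage_size"])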
376 | 377 | Args: 378 | collection_name: Name of the collection to optimize 379 | """ 380 | if not self.storage_path: 381 | return 382 | 383 | collection_path = self.storage_path / f"{collection_name}.h5" 384 | 385 | if not collection_path.exists(): 386 | logger.warning(f"Collection '{collection_name}' not found for optimization") 387 | return 388 | 389 | # Load and resave with optimized compression 390 | try: 391 | with h5py.File(collection_path, 'r') as f: 392 | collection = self._load_collection(f, collection_name) 393 | 394 | # Save with optimized settings 395 | optimized_path = collection_path.with_suffix('.optimized.h5') 396 | with h5py.File(optimized_path, 'w') as f: 397 | self._save_collection(f, collection_name, collection) 398 | 399 | # Replace original file 400 | collection_path.unlink() 401 | optimized_path.rename(collection_path) 402 | 403 | logger.info(f"Storage optimized for collection '{collection_name}'") 404 | 405 | except Exception as e: 406 | logger.error(f"Error optimizing storage for collection '{collection_name}': {e}") 407 | 408 | def get_storage_stats(self) -> Dict[str, Any]: 409 | """Get storage statistics.""" 410 | stats = { 411 | 'storage_path': str(self.storage_path) if self.storage_path else None, 412 | 'cache_enabled': self.enable_cache, 413 | 'cache_size': self.cache_size, 414 | 'cache_hits': self._cache_hits, 415 | 'cache_misses': self._cache_misses, 416 | 'cache_hit_rate': self._cache_hits / (self._cache_hits + self._cache_misses) if (self._cache_hits + self._cache_misses) > 0 else 0 417 | } 418 | 419 | if self.storage_path: 420 | # File system stats 421 | total_size = 0 422 | file_count = 0 423 | for file_path in self.storage_path.glob("*.h5"): 424 | total_size += file_path.stat().st_size 425 | file_count += 1 426 | 427 | stats['total_storage_size'] = total_size 428 | stats['file_count'] = file_count 429 | stats['saved_collections'] = self.list_saved_collections() 430 | 431 | return stats 432 | 433 | def clear_cache(self) -> None: 434 | """Clear the storage cache.""" 435 | self._cache.clear() 436 | self._cache_hits = 0 437 | self._cache_misses = 0 438 | logger.info("Storage cache cleared") 439 | 440 | def _get_cache_key(self, key: str) -> str: 441 | """Generate a cache key.""" 442 | return f"storage_{key}" 443 | 444 | def _get_from_cache(self, key: str) -> Optional[Any]: 445 | """Get value from cache.""" 446 | if not self.enable_cache: 447 | return None 448 | 449 | cache_key = self._get_cache_key(key) 450 | if cache_key in self._cache: 451 | self._cache_hits += 1 452 | return self._cache[cache_key] 453 | 454 | self._cache_misses += 1 455 | return None 456 | 457 | def _set_cache(self, key: str, value: Any) -> None: 458 | """Set value in cache.""" 459 | if not self.enable_cache: 460 | return 461 | 462 | cache_key = self._get_cache_key(key) 463 | 464 | # Implement LRU cache 465 | if len(self._cache) >= self.cache_size: 466 | # Remove oldest item (simple implementation) 467 | oldest_key = next(iter(self._cache)) 468 | del self._cache[oldest_key] 469 | 470 | self._cache[cache_key] = value 471 | -------------------------------------------------------------------------------- /octanedb/index.py: -------------------------------------------------------------------------------- 1 | """ 2 | Index implementations for fast vector similarity search. 
3 | """ 4 | 5 | import numpy as np 6 | import logging 7 | from typing import List, Tuple, Optional, Dict, Any 8 | from enum import Enum 9 | import time 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class IndexType(Enum): 15 | """Supported index types.""" 16 | HNSW = "hnsw" 17 | FLAT = "flat" 18 | IVF = "ivf" 19 | 20 | 21 | class HNSWIndex: 22 | """ 23 | Hierarchical Navigable Small World (HNSW) index for fast vector similarity search. 24 | 25 | HNSW is a graph-based index that provides: 26 | - Sub-linear search complexity 27 | - High accuracy for similarity search 28 | - Efficient construction and updates 29 | - Configurable search depth and connections 30 | """ 31 | 32 | def __init__( 33 | self, 34 | dimension: int, 35 | m: int = 16, 36 | ef_construction: int = 200, 37 | ef_search: int = 100, 38 | max_elements: int = 1000000, 39 | distance_metric: str = "cosine" 40 | ): 41 | """ 42 | Initialize HNSW index. 43 | 44 | Args: 45 | dimension: Vector dimension 46 | m: Maximum number of connections per layer 47 | ef_construction: Construction search depth 48 | ef_search: Search depth 49 | max_elements: Maximum number of vectors 50 | distance_metric: Distance metric for similarity 51 | """ 52 | self.dimension = dimension 53 | self.m = m 54 | self.ef_construction = ef_construction 55 | self.ef_search = ef_search 56 | self.max_elements = max_elements 57 | self.distance_metric = distance_metric 58 | 59 | # Graph structure 60 | self._layers: List[Dict[int, List[int]]] = [] # Adjacency lists for each layer 61 | self._vectors: Optional[np.ndarray] = None 62 | self._vector_ids: Optional[List[int]] = None 63 | self._max_layer: int = 0 64 | 65 | # Entry point 66 | self._entry_point: Optional[int] = None 67 | self._entry_layer: int = 0 68 | 69 | # Distance computation 70 | self._distance_func = self._get_distance_function() 71 | 72 | logger.info(f"HNSW index initialized with m={m}, ef_construction={ef_construction}") 73 | 74 | def _get_distance_function(self): 75 | """Get the appropriate distance function.""" 76 | if self.distance_metric == "cosine": 77 | return self._cosine_distance 78 | elif self.distance_metric == "euclidean": 79 | return self._euclidean_distance 80 | elif self.distance_metric == "dot": 81 | return self._dot_distance 82 | else: 83 | raise ValueError(f"Unsupported distance metric: {self.distance_metric}") 84 | 85 | def _cosine_distance(self, a: np.ndarray, b: np.ndarray) -> float: 86 | """Compute cosine distance between vectors.""" 87 | dot_product = np.dot(a, b) 88 | norm_a = np.linalg.norm(a) 89 | norm_b = np.linalg.norm(b) 90 | if norm_a == 0 or norm_b == 0: 91 | return 1.0 92 | return 1.0 - (dot_product / (norm_a * norm_b)) 93 | 94 | def _euclidean_distance(self, a: np.ndarray, b: np.ndarray) -> float: 95 | """Compute Euclidean distance between vectors.""" 96 | return np.linalg.norm(a - b) 97 | 98 | def _dot_distance(self, a: np.ndarray, b: np.ndarray) -> float: 99 | """Compute negative dot product distance.""" 100 | return -np.dot(a, b) 101 | 102 | def build(self, vectors: np.ndarray, vector_ids: List[int]) -> None: 103 | """ 104 | Build the HNSW index from vectors. 
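        Example (a minimal sketch with random toy data, shown only to
        illustrate the call pattern):

            index = HNSWIndex(dimension=4)
            vectors = np.random.rand(100, 4).astype(np.float32)
            index.build(vectors, vector_ids=list(range(100)))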
105 | 106 | Args: 107 | vectors: Array of vectors to index 108 | vector_ids: Corresponding vector IDs 109 | """ 110 | if len(vectors) == 0: 111 | logger.warning("No vectors to index") 112 | return 113 | 114 | start_time = time.time() 115 | 116 | # Store vectors and IDs 117 | self._vectors = vectors.astype(np.float32) 118 | self._vector_ids = vector_ids 119 | 120 | # Initialize layers 121 | self._layers = [{} for _ in range(self._get_max_layer(len(vectors)) + 1)] 122 | 123 | # Insert vectors one by one 124 | for i in range(len(vectors)): 125 | self._insert_vector(i, vector_ids[i]) 126 | logger.debug(f"Inserted vector {i} (ID: {vector_ids[i]}) at layer {self._layers[-1] if self._layers else 'N/A'}") 127 | 128 | build_time = time.time() - start_time 129 | logger.info(f"HNSW index built for {len(vectors)} vectors in {build_time:.4f}s") 130 | logger.info(f"Final entry point: {self._entry_point}, entry layer: {self._entry_layer}") 131 | logger.info(f"Number of layers: {len(self._layers)}") 132 | 133 | def _get_max_layer(self, num_vectors: int) -> int: 134 | """Calculate maximum layer for the given number of vectors.""" 135 | return max(0, int(np.log(num_vectors) / np.log(self.m))) 136 | 137 | def _insert_vector(self, vector_idx: int, vector_id: int) -> None: 138 | """Insert a single vector into the index.""" 139 | # Determine layer for this vector 140 | layer = self._get_random_layer() 141 | logger.debug(f"Vector {vector_idx} (ID: {vector_id}) assigned to layer {layer}") 142 | 143 | # Find nearest neighbors in current layer and lower layers 144 | if self._entry_point is not None: 145 | nearest = self._search_layer( 146 | self._vectors[vector_idx], 147 | self._entry_point, 148 | self.ef_construction, 149 | layer 150 | ) 151 | 152 | # If no neighbors found in current layer, search in lower layers 153 | if not nearest and layer > 0: 154 | for lower_layer in range(layer - 1, -1, -1): 155 | lower_nearest = self._search_layer( 156 | self._vectors[vector_idx], 157 | self._entry_point, 158 | self.ef_construction, 159 | lower_layer 160 | ) 161 | if lower_nearest: 162 | nearest = lower_nearest 163 | break 164 | 165 | # If still no neighbors found, create a simple connection to the entry point 166 | if not nearest: 167 | nearest = [(self._entry_point, self._distance_func( 168 | self._vectors[vector_idx], 169 | self._vectors[self._entry_point] 170 | ))] 171 | else: 172 | nearest = [] 173 | 174 | # Add connections in current layer 175 | self._add_connections(vector_idx, nearest, layer) 176 | 177 | # Update entry point if needed 178 | if self._entry_point is None or layer > self._entry_layer: 179 | self._entry_point = vector_idx 180 | self._entry_layer = layer 181 | logger.debug(f"Updated entry point to {vector_idx} at layer {layer}") 182 | 183 | # Add to layers 184 | for l in range(layer + 1): 185 | if l not in self._layers: 186 | self._layers.append({}) 187 | self._layers[l][vector_idx] = [] 188 | 189 | # Ensure the vector is also added to the bottom layer (layer 0) for searchability 190 | if 0 not in self._layers: 191 | self._layers.append({}) 192 | if vector_idx not in self._layers[0]: 193 | self._layers[0][vector_idx] = [] 194 | 195 | def _get_random_layer(self) -> int: 196 | """Get random layer based on exponential distribution.""" 197 | # Use a smaller factor to avoid extremely high layers 198 | # The original used self.m which was too large 199 | factor = 1.0 # This gives more reasonable layer distribution 200 | return int(-np.log(np.random.random()) * factor) 201 | 202 | def _search_layer( 203 | 
self, 204 | query: np.ndarray, 205 | entry_point: int, 206 | ef: int, 207 | layer: int 208 | ) -> List[Tuple[int, float]]: 209 | """ 210 | Search for nearest neighbors in a specific layer. 211 | 212 | Args: 213 | query: Query vector 214 | entry_point: Starting point for search 215 | ef: Search depth 216 | layer: Layer to search in 217 | 218 | Returns: 219 | List of (vector_idx, distance) tuples 220 | """ 221 | if layer >= len(self._layers): 222 | return [] 223 | 224 | # Initialize candidates and visited 225 | candidates = [(entry_point, self._distance_func(query, self._vectors[entry_point]))] 226 | visited = {entry_point} 227 | 228 | # Search loop 229 | while candidates: 230 | # Get closest candidate 231 | current_idx, current_dist = candidates.pop(0) 232 | 233 | # Check if we can improve 234 | if len(candidates) >= ef and current_dist > candidates[-1][1]: 235 | break 236 | 237 | # Explore neighbors 238 | for neighbor_idx in self._layers[layer].get(current_idx, []): 239 | if neighbor_idx in visited: 240 | continue 241 | 242 | visited.add(neighbor_idx) 243 | neighbor_dist = self._distance_func(query, self._vectors[neighbor_idx]) 244 | 245 | # Add to candidates if better than worst 246 | if len(candidates) < ef or neighbor_dist < candidates[-1][1]: 247 | # Insert in sorted order 248 | insert_pos = 0 249 | for i, (_, dist) in enumerate(candidates): 250 | if neighbor_dist < dist: 251 | insert_pos = i 252 | break 253 | 254 | candidates.insert(insert_pos, (neighbor_idx, neighbor_dist)) 255 | 256 | # Keep only top ef candidates 257 | if len(candidates) > ef: 258 | candidates = candidates[:ef] 259 | 260 | return candidates 261 | 262 | def _add_connections(self, vector_idx: int, nearest: List[Tuple[int, float]], layer: int) -> None: 263 | """Add connections for a vector in a specific layer.""" 264 | if layer >= len(self._layers): 265 | return 266 | 267 | # Sort by distance 268 | nearest.sort(key=lambda x: x[1]) 269 | 270 | # Add bidirectional connections in the current layer 271 | for neighbor_idx, _ in nearest[:self.m]: 272 | # Add connection from vector to neighbor 273 | if vector_idx not in self._layers[layer]: 274 | self._layers[layer][vector_idx] = [] 275 | self._layers[layer][vector_idx].append(neighbor_idx) 276 | 277 | # Add connection from neighbor to vector 278 | if neighbor_idx not in self._layers[layer]: 279 | self._layers[layer][neighbor_idx] = [] 280 | self._layers[layer][neighbor_idx].append(vector_idx) 281 | 282 | # Also add connections in the bottom layer (layer 0) for better searchability 283 | if layer > 0 and 0 < len(self._layers): 284 | for neighbor_idx, _ in nearest[:self.m]: 285 | # Add connection from vector to neighbor in bottom layer 286 | if vector_idx not in self._layers[0]: 287 | self._layers[0][vector_idx] = [] 288 | if neighbor_idx not in self._layers[0][vector_idx]: 289 | self._layers[0][vector_idx].append(neighbor_idx) 290 | 291 | # Add connection from neighbor to vector in bottom layer 292 | if neighbor_idx not in self._layers[0]: 293 | self._layers[0][neighbor_idx] = [] 294 | if vector_idx not in self._layers[0][neighbor_idx]: 295 | self._layers[0][neighbor_idx].append(vector_idx) 296 | 297 | def search(self, query: np.ndarray, k: int) -> List[Tuple[int, float]]: 298 | """ 299 | Search for k nearest neighbors. 
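        Example (a minimal sketch, continuing from an index built as in the
        build() example above):

            query = np.random.rand(4).astype(np.float32)
            results = index.search(query, k=5)  # [(vector_id, distance), ...]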
300 | 301 | Args: 302 | query: Query vector 303 | k: Number of results to return 304 | 305 | Returns: 306 | List of (vector_id, distance) tuples 307 | """ 308 | if self._entry_point is None: 309 | logger.warning("HNSW search failed: entry_point is None") 310 | return [] 311 | 312 | logger.debug(f"HNSW search: entry_point={self._entry_point}, entry_layer={self._entry_layer}, layers={len(self._layers)}") 313 | 314 | # Start from top layer 315 | current_layer = self._entry_layer 316 | current_point = self._entry_point 317 | 318 | # Search down through layers 319 | while current_layer > 0: 320 | nearest = self._search_layer(query, current_point, 1, current_layer) 321 | if nearest: 322 | current_point = nearest[0][0] 323 | current_layer -= 1 324 | 325 | # Search in bottom layer (layer 0) where most connections are 326 | nearest = self._search_layer(query, current_point, self.ef_search, 0) 327 | 328 | # If no results found in bottom layer, try searching from all vectors in bottom layer 329 | if not nearest and len(self._layers) > 0 and self._layers[0]: 330 | all_candidates = [] 331 | for vector_idx in self._layers[0].keys(): 332 | distance = self._distance_func(query, self._vectors[vector_idx]) 333 | all_candidates.append((vector_idx, distance)) 334 | 335 | # Sort by distance and take top k 336 | all_candidates.sort(key=lambda x: x[1]) 337 | nearest = all_candidates[:k] 338 | 339 | # Convert to vector IDs and return top k 340 | results = [] 341 | for vector_idx, distance in nearest[:k]: 342 | vector_id = self._vector_ids[vector_idx] 343 | results.append((vector_id, distance)) 344 | 345 | return results 346 | 347 | def search_batch(self, queries: np.ndarray, k: int) -> List[List[Tuple[int, float]]]: 348 | """ 349 | Batch search for multiple queries. 350 | 351 | Args: 352 | queries: Array of query vectors 353 | k: Number of results per query 354 | 355 | Returns: 356 | List of result lists for each query 357 | """ 358 | results = [] 359 | for query in queries: 360 | query_results = self.search(query, k) 361 | results.append(query_results) 362 | return results 363 | 364 | def optimize(self) -> None: 365 | """Optimize the index structure.""" 366 | # Rebalance connections if needed 367 | for layer in range(len(self._layers)): 368 | for vector_idx in list(self._layers[layer].keys()): 369 | connections = self._layers[layer][vector_idx] 370 | if len(connections) > self.m * 2: 371 | # Keep only closest connections 372 | if self._vectors is not None: 373 | distances = [ 374 | (neighbor_idx, self._distance_func( 375 | self._vectors[vector_idx], 376 | self._vectors[neighbor_idx] 377 | )) 378 | for neighbor_idx in connections 379 | ] 380 | distances.sort(key=lambda x: x[1]) 381 | best_connections = [idx for idx, _ in distances[:self.m]] 382 | 383 | # Update connections 384 | self._layers[layer][vector_idx] = best_connections 385 | 386 | # Remove reverse connections 387 | for neighbor_idx in best_connections: 388 | if neighbor_idx in self._layers[layer]: 389 | if vector_idx in self._layers[layer][neighbor_idx]: 390 | self._layers[layer][neighbor_idx].remove(vector_idx) 391 | 392 | logger.info("HNSW index optimization completed") 393 | 394 | def get_stats(self) -> Dict[str, Any]: 395 | """Get index statistics.""" 396 | stats = { 397 | "index_type": "hnsw", 398 | "dimension": self.dimension, 399 | "m": self.m, 400 | "ef_construction": self.ef_construction, 401 | "ef_search": self.ef_search, 402 | "max_elements": self.max_elements, 403 | "distance_metric": self.distance_metric, 404 | "num_vectors": 
len(self._vector_ids) if self._vector_ids else 0, 405 | "num_layers": len(self._layers), 406 | "entry_point": self._entry_point, 407 | "entry_layer": self._entry_layer 408 | } 409 | 410 | # Connection statistics 411 | if self._layers: 412 | total_connections = sum(len(connections) for connections in self._layers[0].values()) 413 | avg_connections = total_connections / len(self._layers[0]) if self._layers[0] else 0 414 | stats["total_connections"] = total_connections 415 | stats["avg_connections"] = avg_connections 416 | 417 | return stats 418 | 419 | 420 | class FlatIndex: 421 | """ 422 | Simple flat index for exact search (used for small datasets or testing). 423 | """ 424 | 425 | def __init__(self, dimension: int, distance_metric: str = "cosine"): 426 | self.dimension = dimension 427 | self.distance_metric = distance_metric 428 | self._vectors: Optional[np.ndarray] = None 429 | self._vector_ids: Optional[List[int]] = None 430 | self._distance_func = self._get_distance_function() 431 | 432 | def _get_distance_function(self): 433 | """Get the appropriate distance function.""" 434 | if self.distance_metric == "cosine": 435 | return self._cosine_distance 436 | elif self.distance_metric == "euclidean": 437 | return self._euclidean_distance 438 | elif self.distance_metric == "dot": 439 | return self._dot_distance 440 | else: 441 | raise ValueError(f"Unsupported distance metric: {self.distance_metric}") 442 | 443 | def _cosine_distance(self, a: np.ndarray, b: np.ndarray) -> float: 444 | """Compute cosine distance between vectors.""" 445 | dot_product = np.dot(a, b) 446 | norm_a = np.linalg.norm(a) 447 | norm_b = np.linalg.norm(b) 448 | if norm_a == 0 or norm_b == 0: 449 | return 1.0 450 | return 1.0 - (dot_product / (norm_a * norm_b)) 451 | 452 | def _euclidean_distance(self, a: np.ndarray, b: np.ndarray) -> float: 453 | """Compute Euclidean distance between vectors.""" 454 | return np.linalg.norm(a - b) 455 | 456 | def _dot_distance(self, a: np.ndarray, b: np.ndarray) -> float: 457 | """Compute negative dot product distance.""" 458 | return -np.dot(a, b) 459 | 460 | def build(self, vectors: np.ndarray, vector_ids: List[int]) -> None: 461 | """Build the flat index.""" 462 | self._vectors = vectors.astype(np.float32) 463 | self._vector_ids = vector_ids 464 | 465 | def search(self, query: np.ndarray, k: int) -> List[Tuple[int, float]]: 466 | """Search for k nearest neighbors using exact computation.""" 467 | if self._vectors is None: 468 | return [] 469 | 470 | # Compute distances to all vectors 471 | distances = [] 472 | for i, vector in enumerate(self._vectors): 473 | distance = self._distance_func(query, vector) 474 | distances.append((i, distance)) 475 | 476 | # Sort by distance and return top k 477 | distances.sort(key=lambda x: x[1]) 478 | 479 | results = [] 480 | for vector_idx, distance in distances[:k]: 481 | vector_id = self._vector_ids[vector_idx] 482 | results.append((vector_id, distance)) 483 | 484 | return results 485 | 486 | def search_batch(self, queries: np.ndarray, k: int) -> List[List[Tuple[int, float]]]: 487 | """Batch search for multiple queries.""" 488 | results = [] 489 | for query in queries: 490 | query_results = self.search(query, k) 491 | results.append(query_results) 492 | return results 493 | 494 | def optimize(self) -> None: 495 | """No optimization needed for flat index.""" 496 | pass 497 | 498 | def get_stats(self) -> Dict[str, Any]: 499 | """Get index statistics.""" 500 | return { 501 | "index_type": "flat", 502 | "dimension": self.dimension, 503 | 
"distance_metric": self.distance_metric, 504 | "num_vectors": len(self._vector_ids) if self._vector_ids else 0 505 | } 506 | -------------------------------------------------------------------------------- /octanedb/core.py: -------------------------------------------------------------------------------- 1 | """ 2 | OctaneDB Core - Main database interface. 3 | Provides high-level operations for vector database management. 4 | """ 5 | 6 | import logging 7 | from typing import Dict, List, Optional, Union, Any, Tuple 8 | from pathlib import Path 9 | import numpy as np 10 | 11 | from .collection import Collection 12 | from .storage import StorageManager 13 | from .embeddings import TextEmbedder, ChromaCompatibleEmbedder 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | class OctaneDB: 19 | """ 20 | OctaneDB - Lightning Fast Vector Database. 21 | 22 | A lightweight, high-performance vector database library that provides: 23 | - Fast vector similarity search using HNSW indexing 24 | - Automatic text embedding generation 25 | - ChromaDB-compatible API 26 | - Multiple storage modes (in-memory, persistent, hybrid) 27 | - Advanced metadata filtering 28 | - Batch operations for improved performance 29 | """ 30 | 31 | def __init__( 32 | self, 33 | dimension: int, 34 | index_type: str = "hnsw", 35 | m: int = 16, 36 | ef_construction: int = 200, 37 | ef_search: int = 100, 38 | max_elements: int = 1000000, 39 | distance_metric: str = "cosine", 40 | storage_path: Optional[Union[str, Path]] = None, 41 | embedding_model: Optional[str] = None, 42 | enable_text_embeddings: bool = True, 43 | **kwargs 44 | ): 45 | """ 46 | Initialize OctaneDB. 47 | 48 | Args: 49 | dimension: Vector dimension 50 | index_type: Type of index to use 51 | m: HNSW connections per layer 52 | ef_construction: Construction search depth 53 | ef_search: Search depth 54 | max_elements: Maximum number of vectors 55 | distance_metric: Distance metric for similarity 56 | storage_path: Path for persistent storage 57 | embedding_model: Sentence-transformers model for text embeddings 58 | enable_text_embeddings: Whether to enable text embedding functionality 59 | **kwargs: Additional arguments for collections 60 | """ 61 | self.dimension = dimension 62 | self.index_type = index_type 63 | self.m = m 64 | self.ef_construction = ef_construction 65 | self.ef_search = ef_search 66 | self.max_elements = max_elements 67 | self.distance_metric = distance_metric 68 | self.embedding_model = embedding_model 69 | self.enable_text_embeddings = enable_text_embeddings 70 | 71 | # Storage management 72 | self._storage_path = Path(storage_path) if storage_path else None 73 | self._storage_manager = StorageManager(storage_path=self._storage_path) if self._storage_path else None 74 | 75 | # Collection management 76 | self._collections: Dict[str, Collection] = {} 77 | self._current_collection: Optional[Collection] = None 78 | 79 | # Text embedding support 80 | self._text_embedder: Optional[TextEmbedder] = None 81 | if enable_text_embeddings and embedding_model: 82 | try: 83 | self._text_embedder = TextEmbedder(embedding_model) 84 | # Update dimension if it doesn't match 85 | if self._text_embedder.dimension != dimension: 86 | logger.warning(f"Embedding model dimension ({self._text_embedder.dimension}) " 87 | f"doesn't match specified dimension ({dimension})") 88 | self.dimension = self._text_embedder.dimension 89 | except ImportError: 90 | logger.warning("Text embeddings disabled: sentence-transformers not available") 91 | self._text_embedder = None 92 
| 93 | # Performance tracking 94 | self._stats = { 95 | "collections_created": 0, 96 | "total_vectors": 0, 97 | "total_documents": 0, 98 | "operations_performed": 0 99 | } 100 | 101 | logger.info(f"OctaneDB initialized with dimension {self.dimension}") 102 | if self._text_embedder: 103 | logger.info(f"Text embeddings enabled with model: {embedding_model}") 104 | 105 | def create_collection( 106 | self, 107 | name: str, 108 | **kwargs 109 | ) -> Collection: 110 | """ 111 | Create a new collection. 112 | 113 | Args: 114 | name: Collection name 115 | **kwargs: Additional collection parameters 116 | 117 | Returns: 118 | Created collection instance 119 | """ 120 | if name in self._collections: 121 | raise ValueError(f"Collection '{name}' already exists") 122 | 123 | # Use embedding model from OctaneDB if not specified 124 | if "embedding_model" not in kwargs and self.embedding_model: 125 | kwargs["embedding_model"] = self.embedding_model 126 | 127 | # Create collection with text embedding support 128 | collection = Collection( 129 | name=name, 130 | dimension=self.dimension, 131 | index_type=self.index_type, 132 | m=self.m, 133 | ef_construction=self.ef_construction, 134 | ef_search=self.ef_search, 135 | max_elements=self.max_elements, 136 | distance_metric=self.distance_metric, 137 | enable_text_embeddings=self.enable_text_embeddings, 138 | **kwargs 139 | ) 140 | 141 | self._collections[name] = collection 142 | 143 | # Set as current collection if it's the first one 144 | if self._current_collection is None: 145 | self._current_collection = collection 146 | 147 | # Update stats 148 | self._stats["collections_created"] += 1 149 | 150 | logger.info(f"Collection '{name}' created successfully") 151 | return collection 152 | 153 | def use_collection(self, name: str) -> Collection: 154 | """ 155 | Set the current collection for operations. 156 | 157 | Args: 158 | name: Collection name 159 | 160 | Returns: 161 | Collection instance 162 | """ 163 | if name not in self._collections: 164 | raise ValueError(f"Collection '{name}' does not exist") 165 | 166 | self._current_collection = self._collections[name] 167 | logger.debug(f"Switched to collection: {name}") 168 | return self._current_collection 169 | 170 | def get_collection(self, name: str) -> Collection: 171 | """ 172 | Get a collection by name. 173 | 174 | Args: 175 | name: Collection name 176 | 177 | Returns: 178 | Collection instance 179 | """ 180 | if name not in self._collections: 181 | raise ValueError(f"Collection '{name}' does not exist") 182 | 183 | return self._collections[name] 184 | 185 | def list_collections(self) -> List[str]: 186 | """List all collection names.""" 187 | return list(self._collections.keys()) 188 | 189 | def delete_collection(self, name: str) -> bool: 190 | """ 191 | Delete a collection. 
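        Example (a minimal sketch of the collection lifecycle; the collection
        name is illustrative):

            db = OctaneDB(dimension=384)
            db.create_collection("articles")
            db.use_collection("articles")
            db.delete_collection("articles")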
192 | 193 | Args: 194 | name: Collection name 195 | 196 | Returns: 197 | True if deleted, False if not found 198 | """ 199 | if name not in self._collections: 200 | return False 201 | 202 | # Clear current collection if it's the one being deleted 203 | if self._current_collection == self._collections[name]: 204 | self._current_collection = None 205 | 206 | # Delete collection 207 | del self._collections[name] 208 | 209 | # Set new current collection if needed 210 | if self._current_collection is None and self._collections: 211 | self._current_collection = list(self._collections.values())[0] 212 | 213 | logger.info(f"Collection '{name}' deleted") 214 | return True 215 | 216 | def insert( 217 | self, 218 | vectors: Union[np.ndarray, List], 219 | metadata: Optional[List[Dict[str, Any]]] = None, 220 | ids: Optional[List[int]] = None 221 | ) -> Union[int, List[int]]: 222 | """ 223 | Insert vectors into the current collection. 224 | 225 | Args: 226 | vectors: Vector(s) to insert 227 | metadata: Optional metadata for each vector 228 | ids: Optional custom IDs 229 | 230 | Returns: 231 | Inserted vector ID(s) 232 | """ 233 | if self._current_collection is None: 234 | raise RuntimeError("No current collection. Use create_collection() or use_collection() first.") 235 | 236 | result = self._current_collection.insert(vectors, metadata, ids) 237 | self._stats["operations_performed"] += 1 238 | self._update_total_stats() 239 | return result 240 | 241 | def add( 242 | self, 243 | ids: Optional[Union[str, List[str]]] = None, 244 | documents: Union[str, List[str]] = None, 245 | metadatas: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, 246 | embeddings: Optional[Union[List[float], List[List[float]]]] = None 247 | ) -> Dict[str, Any]: 248 | """ 249 | ChromaDB-compatible add method for text documents. 250 | 251 | Args: 252 | ids: Document IDs (auto-generated if not provided) 253 | documents: Text documents to add 254 | metadatas: Optional metadata for documents 255 | embeddings: Pre-computed embeddings (optional) 256 | 257 | Returns: 258 | Dictionary with 'ids' and 'embeddings' keys 259 | """ 260 | if self._current_collection is None: 261 | raise RuntimeError("No current collection. Use create_collection() or use_collection() first.") 262 | 263 | result = self._current_collection.add(ids, documents, metadatas, embeddings) 264 | self._stats["operations_performed"] += 1 265 | self._update_total_stats() 266 | return result 267 | 268 | def add_text_documents( 269 | self, 270 | documents: Union[str, List[str]], 271 | ids: Optional[Union[str, List[str]]] = None, 272 | metadatas: Optional[Union[Dict[str, Any], List[Dict[str, Any]]]] = None, 273 | batch_size: int = 32, 274 | show_progress_bar: bool = False 275 | ) -> Dict[str, Any]: 276 | """ 277 | Add text documents with automatic embedding generation. 278 | 279 | Args: 280 | documents: Text document(s) to add 281 | ids: Document IDs (auto-generated if not provided) 282 | metadatas: Optional metadata for documents 283 | batch_size: Batch size for embedding generation 284 | show_progress_bar: Whether to show progress bar 285 | 286 | Returns: 287 | Dictionary with 'ids', 'embeddings', and 'documents' keys 288 | """ 289 | if self._current_collection is None: 290 | raise RuntimeError("No current collection. 
Use create_collection() or use_collection() first.") 291 | 292 | result = self._current_collection.add_text_documents( 293 | documents, ids, metadatas, batch_size, show_progress_bar 294 | ) 295 | self._stats["operations_performed"] += 1 296 | self._update_total_stats() 297 | return result 298 | 299 | def search( 300 | self, 301 | query_vector: np.ndarray, 302 | k: int = 10, 303 | filter: Optional[Dict[str, Any]] = None, 304 | include_metadata: bool = False 305 | ) -> List[Tuple[int, float, Optional[Dict[str, Any]]]]: 306 | """ 307 | Search for similar vectors in the current collection. 308 | 309 | Args: 310 | query_vector: Query vector 311 | k: Number of results to return 312 | filter: Optional metadata filter 313 | include_metadata: Whether to include metadata in results 314 | 315 | Returns: 316 | List of (id, distance, metadata) tuples 317 | """ 318 | if self._current_collection is None: 319 | raise RuntimeError("No current collection. Use create_collection() or use_collection() first.") 320 | 321 | result = self._current_collection.search(query_vector, k, filter, include_metadata) 322 | self._stats["operations_performed"] += 1 323 | return result 324 | 325 | def search_text( 326 | self, 327 | query_text: str, 328 | k: int = 10, 329 | filter: Optional[Dict[str, Any]] = None, 330 | include_metadata: bool = False 331 | ) -> List[Tuple[int, float, Optional[Dict[str, Any]]]]: 332 | """ 333 | Search for similar documents using text query. 334 | 335 | Args: 336 | query_text: Text query to search for 337 | k: Number of results to return 338 | filter: Optional metadata filter 339 | include_metadata: Whether to include metadata in results 340 | 341 | Returns: 342 | List of (id, distance, metadata) tuples 343 | """ 344 | if self._current_collection is None: 345 | raise RuntimeError("No current collection. Use create_collection() or use_collection() first.") 346 | 347 | result = self._current_collection.search_text(query_text, k, filter, include_metadata) 348 | self._stats["operations_performed"] += 1 349 | return result 350 | 351 | def search_batch( 352 | self, 353 | query_vectors: np.ndarray, 354 | k: int = 10, 355 | filter: Optional[Dict[str, Any]] = None, 356 | include_metadata: bool = False 357 | ) -> List[List[Tuple[int, float, Optional[Dict[str, Any]]]]]: 358 | """ 359 | Batch search for similar vectors. 360 | 361 | Args: 362 | query_vectors: Query vectors 363 | k: Number of results per query 364 | filter: Optional metadata filter 365 | include_metadata: Whether to include metadata in results 366 | 367 | Returns: 368 | List of result lists for each query 369 | """ 370 | if self._current_collection is None: 371 | raise RuntimeError("No current collection. Use create_collection() or use_collection() first.") 372 | 373 | result = self._current_collection.search_batch(query_vectors, k, filter, include_metadata) 374 | self._stats["operations_performed"] += 1 375 | return result 376 | 377 | def search_text_batch( 378 | self, 379 | query_texts: List[str], 380 | k: int = 10, 381 | filter: Optional[Dict[str, Any]] = None, 382 | include_metadata: bool = False, 383 | batch_size: int = 32 384 | ) -> List[List[Tuple[int, float, Optional[Dict[str, Any]]]]]: 385 | """ 386 | Batch search for similar documents using text queries. 
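        Example (a minimal sketch; the model name, documents, and query strings
        are illustrative):

            db = OctaneDB(dimension=384, embedding_model="all-MiniLM-L6-v2")
            db.create_collection("docs")
            db.add_text_documents(["OctaneDB stores vectors.", "HNSW speeds up search."])
            hits = db.search_text_batch(["fast vector search"], k=2)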
387 | 388 | Args: 389 | query_texts: List of text queries 390 | k: Number of results per query 391 | filter: Optional metadata filter 392 | include_metadata: Whether to include metadata in results 393 | batch_size: Batch size for embedding generation 394 | 395 | Returns: 396 | List of result lists for each query 397 | """ 398 | if self._current_collection is None: 399 | raise RuntimeError("No current collection. Use create_collection() or use_collection() first.") 400 | 401 | result = self._current_collection.search_text_batch( 402 | query_texts, k, filter, include_metadata, batch_size 403 | ) 404 | self._stats["operations_performed"] += 1 405 | return result 406 | 407 | def update(self, id: int, vector: np.ndarray, metadata: Optional[Dict[str, Any]] = None) -> None: 408 | """ 409 | Update a vector in the current collection. 410 | 411 | Args: 412 | id: Vector ID to update 413 | vector: New vector 414 | metadata: New metadata 415 | """ 416 | if self._current_collection is None: 417 | raise RuntimeError("No current collection. Use create_collection() or use_collection() first.") 418 | 419 | self._current_collection.update(id, vector, metadata) 420 | self._stats["operations_performed"] += 1 421 | 422 | def delete(self, id: int) -> None: 423 | """ 424 | Delete a vector from the current collection. 425 | 426 | Args: 427 | id: Vector ID to delete 428 | """ 429 | if self._current_collection is None: 430 | raise RuntimeError("No current collection. Use create_collection() or use_collection() first.") 431 | 432 | self._current_collection.delete(id) 433 | self._stats["operations_performed"] += 1 434 | self._update_total_stats() 435 | 436 | def delete_batch(self, ids: List[int]) -> None: 437 | """ 438 | Batch delete vectors from the current collection. 439 | 440 | Args: 441 | ids: List of vector IDs to delete 442 | """ 443 | if self._current_collection is None: 444 | raise RuntimeError("No current collection. Use create_collection() or use_collection() first.") 445 | 446 | self._current_collection.delete_batch(ids) 447 | self._stats["operations_performed"] += 1 448 | self._update_total_stats() 449 | 450 | def get_vector(self, id: int, include_metadata: bool = False) -> Union[np.ndarray, Tuple[np.ndarray, Optional[Dict[str, Any]]]]: 451 | """ 452 | Get a vector from the current collection. 453 | 454 | Args: 455 | id: Vector ID 456 | include_metadata: Whether to include metadata 457 | 458 | Returns: 459 | Vector or (vector, metadata) tuple 460 | """ 461 | if self._current_collection is None: 462 | raise RuntimeError("No current collection. Use create_collection() or use_collection() first.") 463 | 464 | return self._current_collection.get_vector(id, include_metadata) 465 | 466 | def get_document(self, doc_id: str) -> Optional[str]: 467 | """ 468 | Get a text document from the current collection. 469 | 470 | Args: 471 | doc_id: Document ID 472 | 473 | Returns: 474 | Document text or None if not found 475 | """ 476 | if self._current_collection is None: 477 | raise RuntimeError("No current collection. Use create_collection() or use_collection() first.") 478 | 479 | return self._current_collection.get_document(doc_id) 480 | 481 | def list_documents(self) -> List[str]: 482 | """List all text document IDs in the current collection.""" 483 | if self._current_collection is None: 484 | raise RuntimeError("No current collection. 
Use create_collection() or use_collection() first.") 485 | 486 | return self._current_collection.list_documents() 487 | 488 | def delete_document(self, doc_id: str) -> bool: 489 | """ 490 | Delete a text document from the current collection. 491 | 492 | Args: 493 | doc_id: Document ID 494 | 495 | Returns: 496 | True if deleted, False if not found 497 | """ 498 | if self._current_collection is None: 499 | raise RuntimeError("No current collection. Use create_collection() or use_collection() first.") 500 | 501 | result = self._current_collection.delete_document(doc_id) 502 | if result: 503 | self._stats["operations_performed"] += 1 504 | self._update_total_stats() 505 | return result 506 | 507 | def clear_documents(self) -> None: 508 | """Clear all text documents from the current collection.""" 509 | if self._current_collection is None: 510 | raise RuntimeError("No current collection. Use create_collection() or use_collection() first.") 511 | 512 | self._current_collection.clear_documents() 513 | self._stats["operations_performed"] += 1 514 | self._update_total_stats() 515 | 516 | def change_embedding_model(self, model_name: str, **kwargs) -> None: 517 | """ 518 | Change the text embedding model for the current collection. 519 | 520 | Args: 521 | model_name: New sentence-transformers model name 522 | **kwargs: Additional arguments for TextEmbedder 523 | """ 524 | if self._current_collection is None: 525 | raise RuntimeError("No current collection. Use create_collection() or use_collection() first.") 526 | 527 | self._current_collection.change_embedding_model(model_name, **kwargs) 528 | # Update OctaneDB dimension if it changed 529 | if self._current_collection.dimension != self.dimension: 530 | self.dimension = self._current_collection.dimension 531 | logger.info(f"OctaneDB dimension updated to: {self.dimension}") 532 | 533 | def get_available_models(self) -> List[str]: 534 | """Get list of available sentence-transformers models.""" 535 | if not self._text_embedder: 536 | return [] 537 | return self._text_embedder.get_available_models() 538 | 539 | def get_stats(self) -> Dict[str, Any]: 540 | """Get comprehensive database statistics.""" 541 | stats = self._stats.copy() 542 | stats["collection_count"] = len(self._collections) 543 | stats["current_collection"] = self._current_collection.name if self._current_collection else None 544 | stats["text_embeddings_enabled"] = self._text_embedder is not None 545 | stats["embedding_model"] = self._text_embedder.model_name if self._text_embedder else None 546 | 547 | # Add collection-specific stats 548 | if self._current_collection: 549 | collection_stats = self._current_collection.get_stats() 550 | stats.update(collection_stats) 551 | 552 | return stats 553 | 554 | def get_collection_info(self, name: Optional[str] = None) -> Dict[str, Any]: 555 | """ 556 | Get information about a collection. 
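        Example (a minimal sketch, assuming a collection named "articles"
        already exists):

            info = db.get_collection_info("articles")
            print(info["num_vectors"], info["dimension"])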
557 | 
558 |         Args:
559 |             name: Collection name (uses current collection if None)
560 | 
561 |         Returns:
562 |             Collection information dictionary
563 |         """
564 |         if name is None:
565 |             if not self._current_collection:
566 |                 raise RuntimeError("No current collection")
567 |             return self._current_collection.get_collection_info()
568 | 
569 |         if name not in self._collections:
570 |             raise ValueError(f"Collection '{name}' does not exist")
571 | 
572 |         return self._collections[name].get_collection_info()
573 | 
574 |     def _update_total_stats(self) -> None:
575 |         """Update total statistics across all collections."""
576 |         total_vectors = 0
577 |         total_documents = 0
578 | 
579 |         for collection in self._collections.values():
580 |             collection_stats = collection.get_stats()
581 |             total_vectors += collection_stats.get("vector_count", 0)
582 |             total_documents += collection_stats.get("document_count", 0)
583 | 
584 |         self._stats["total_vectors"] = total_vectors
585 |         self._stats["total_documents"] = total_documents
586 | 
587 |     def save(self, path: Optional[Union[str, Path]] = None) -> None:
588 |         """
589 |         Save the database to persistent storage.
590 | 
591 |         Args:
592 |             path: Optional custom save path
593 |         """
594 |         if not self._storage_manager:
595 |             raise RuntimeError("Storage manager not initialized. Set storage_path during initialization.")
596 | 
597 |         save_path = Path(path) if path else self._storage_path
598 |         self._storage_manager.save_database(self._collections, save_path)
599 |         logger.info(f"Database saved to: {save_path}")
600 | 
601 |     @classmethod
602 |     def load(cls, path: str, **kwargs) -> "OctaneDB":
603 |         """
604 |         Load a database from persistent storage.
605 | 
606 |         Args:
607 |             path: Path to the saved database
608 |             **kwargs: Additional initialization parameters
609 | 
610 |         Returns:
611 |             Loaded OctaneDB instance
612 |         """
613 |         path = Path(path)
614 |         if not path.exists():
615 |             raise FileNotFoundError(f"Database file not found: {path}")
616 | 
617 |         storage_manager = StorageManager(storage_path=path.parent)
618 |         metadata = storage_manager.load_database_metadata(path)
619 | 
620 |         # Get dimension from first collection
621 |         collections_metadata = metadata.get("collections", {})
622 |         if not collections_metadata:
623 |             raise ValueError("No collections found in database")
624 | 
625 |         first_collection_name = list(collections_metadata.keys())[0]
626 |         first_collection_meta = collections_metadata[first_collection_name]
627 | 
628 |         # Create instance with loaded parameters
629 |         instance = cls(
630 |             dimension=first_collection_meta["dimension"],
631 |             index_type=first_collection_meta.get("index_type", "hnsw"),
632 |             m=first_collection_meta.get("m", 16),
633 |             ef_construction=first_collection_meta.get("ef_construction", 200),
634 |             ef_search=first_collection_meta.get("ef_search", 100),
635 |             max_elements=first_collection_meta.get("max_elements", 1000000),
636 |             distance_metric=first_collection_meta.get("distance_metric", "cosine"),
637 |             storage_path=path.parent,
638 |             **kwargs
639 |         )
640 | 
641 |         # Load collections and restore the current collection pointer
642 |         instance._collections = storage_manager.load_database(path)
643 |         instance._current_collection = next(iter(instance._collections.values()), None)
644 |         logger.info(f"Database loaded from: {path}")
645 |         return instance
646 | 
647 |     def __enter__(self):
648 |         """Context manager entry."""
649 |         return self
650 | 
651 |     def __exit__(self, exc_type, exc_val, exc_tb):
652 |         """Context manager exit."""
653 |         if self._storage_manager:
654 |             self.save()
655 | 
656 |     def __len__(self) -> int:
657 |         """Return total number of vectors across all collections."""
658 |         return sum(len(collection) for collection
in self._collections.values()) 659 | 660 | def __contains__(self, collection_name: str) -> bool: 661 | """Check if a collection exists.""" 662 | return collection_name in self._collections 663 | -------------------------------------------------------------------------------- /octanedb/collection.py: -------------------------------------------------------------------------------- 1 | """ 2 | Collection management for OctaneDB. 3 | Handles vector storage, indexing, and operations within a collection. 4 | """ 5 | 6 | import logging 7 | import time 8 | from typing import Dict, List, Optional, Union, Any, Tuple 9 | import numpy as np 10 | 11 | from .index import HNSWIndex 12 | from .embeddings import TextEmbedder, ChromaCompatibleEmbedder 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class Collection: 18 | """ 19 | A collection of vectors with metadata and indexing capabilities. 20 | 21 | Features: 22 | - Vector storage and retrieval 23 | - Metadata management 24 | - Automatic indexing (HNSW) 25 | - Text embedding generation 26 | - ChromaDB-compatible API 27 | """ 28 | 29 | def __init__( 30 | self, 31 | name: str, 32 | dimension: int, 33 | index_type: str = "hnsw", 34 | m: int = 16, 35 | ef_construction: int = 200, 36 | ef_search: int = 100, 37 | max_elements: int = 1000000, 38 | distance_metric: str = "cosine", 39 | storage_manager=None, 40 | query_engine=None, 41 | vector_utils=None, 42 | embedding_model=None, 43 | enable_text_embeddings: bool = True 44 | ): 45 | """ 46 | Initialize a collection. 47 | 48 | Args: 49 | name: Collection name 50 | dimension: Vector dimension 51 | index_type: Type of index to use 52 | m: HNSW connections per layer 53 | ef_construction: Construction search depth 54 | ef_search: Search depth 55 | max_elements: Maximum number of vectors 56 | distance_metric: Distance metric for similarity 57 | storage_manager: Storage manager instance 58 | query_engine: Query engine instance 59 | vector_utils: Vector utilities instance 60 | embedding_model: Sentence-transformers model name for text embeddings 61 | enable_text_embeddings: Whether to enable text embedding functionality 62 | """ 63 | self.name = name 64 | self.dimension = dimension 65 | self.index_type = index_type 66 | self.m = m 67 | self.ef_construction = ef_construction 68 | self.ef_search = ef_search 69 | self.max_elements = max_elements 70 | self.distance_metric = distance_metric 71 | 72 | # Initialize components 73 | self._storage_manager = storage_manager 74 | self._query_engine = query_engine 75 | self._vector_utils = vector_utils 76 | 77 | # Vector storage 78 | self._vectors: Dict[int, np.ndarray] = {} 79 | self._metadata: Dict[int, Dict[str, Any]] = {} 80 | self._next_id = 0 81 | 82 | # Text document storage 83 | self._documents: Dict[int, str] = {} 84 | self._text_embedder = None 85 | 86 | # Initialize text embeddings if enabled 87 | if enable_text_embeddings and embedding_model: 88 | try: 89 | self._text_embedder = TextEmbedder(embedding_model) 90 | # Update dimension if it doesn't match 91 | if self._text_embedder.dimension != dimension: 92 | logger.warning(f"Embedding model dimension ({self._text_embedder.dimension}) " 93 | f"doesn't match collection dimension ({dimension})") 94 | self.dimension = self._text_embedder.dimension 95 | except ImportError: 96 | logger.warning("Text embeddings disabled: sentence-transformers not available") 97 | self._text_embedder = None 98 | 99 | # Index management 100 | self._index: Optional[HNSWIndex] = None 101 | self._index_built = False 102 | 
self._index_needs_rebuild = False 103 | 104 | # Performance tracking 105 | self._stats = { 106 | "inserts": 0, 107 | "searches": 0, 108 | "updates": 0, 109 | "deletes": 0, 110 | "text_documents": 0, 111 | "index_builds": 0, 112 | "last_index_build": None 113 | } 114 | 115 | # Initialize index 116 | self._init_index() 117 | 118 | logger.info(f"Collection '{name}' initialized with dimension {dimension}") 119 | 120 | def _init_index(self) -> None: 121 | """Initialize the vector index.""" 122 | if self.index_type == "hnsw": 123 | self._index = HNSWIndex( 124 | dimension=self.dimension, 125 | m=self.m, 126 | ef_construction=self.ef_construction, 127 | ef_search=self.ef_search, 128 | max_elements=self.max_elements, 129 | distance_metric=self.distance_metric 130 | ) 131 | else: 132 | raise ValueError(f"Unsupported index type: {self.index_type}") 133 | 134 | def insert( 135 | self, 136 | vectors, 137 | metadata=None, 138 | ids=None 139 | ): 140 | """ 141 | Insert vectors into the collection. 142 | 143 | Args: 144 | vectors: Vector(s) to insert 145 | metadata: Optional metadata for each vector 146 | ids: Optional custom IDs 147 | 148 | Returns: 149 | Inserted vector ID(s) 150 | """ 151 | # Convert to numpy array if needed 152 | if isinstance(vectors, list): 153 | vectors = np.array(vectors, dtype=np.float32) 154 | elif not isinstance(vectors, np.ndarray): 155 | vectors = np.array([vectors], dtype=np.float32) 156 | 157 | # Ensure 2D array 158 | if vectors.ndim == 1: 159 | vectors = vectors.reshape(1, -1) 160 | 161 | # Validate dimensions 162 | if vectors.shape[1] != self.dimension: 163 | raise ValueError(f"Vector dimension {vectors.shape[1]} does not match collection dimension {self.dimension}") 164 | 165 | # Handle metadata 166 | if metadata is None: 167 | metadata = [{} for _ in range(len(vectors))] 168 | elif not isinstance(metadata, list): 169 | metadata = [metadata] 170 | 171 | # Handle IDs 172 | if ids is None: 173 | ids = [self._next_id + i for i in range(len(vectors))] 174 | elif not isinstance(ids, list): 175 | ids = [ids] 176 | 177 | # Validate lengths 178 | if len(vectors) != len(metadata) or len(vectors) != len(ids): 179 | raise ValueError("Vectors, metadata, and IDs must have the same length") 180 | 181 | # Insert vectors 182 | inserted_ids = [] 183 | for i, (vector, meta, vector_id) in enumerate(zip(vectors, metadata, ids)): 184 | # Check if ID already exists 185 | if vector_id in self._vectors: 186 | raise ValueError(f"Vector ID {vector_id} already exists") 187 | 188 | # Store vector and metadata 189 | self._vectors[vector_id] = vector.copy() 190 | self._metadata[vector_id] = meta.copy() 191 | inserted_ids.append(vector_id) 192 | 193 | # Update next_id (only for integer IDs) 194 | if isinstance(vector_id, int): 195 | self._next_id = max(self._next_id, vector_id + 1) 196 | 197 | # Mark index for rebuild 198 | self._index_needs_rebuild = True 199 | 200 | # Update stats 201 | self._stats["inserts"] += len(vectors) 202 | 203 | logger.debug(f"Inserted {len(vectors)} vectors into collection '{self.name}'") 204 | 205 | # Return single ID or list based on input 206 | return inserted_ids[0] if len(inserted_ids) == 1 else inserted_ids 207 | 208 | def add_text_documents( 209 | self, 210 | documents, 211 | ids=None, 212 | metadatas=None, 213 | batch_size: int = 32, 214 | show_progress_bar: bool = False 215 | ): 216 | """ 217 | Add text documents with automatic embedding generation (ChromaDB-compatible). 
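        Example (a minimal sketch; the model name and document texts are
        illustrative):

            collection = Collection("docs", dimension=384, embedding_model="all-MiniLM-L6-v2")
            result = collection.add_text_documents(
                ["first document", "second document"],
                metadatas=[{"source": "a"}, {"source": "b"}],
            )
            print(result["ids"])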
218 | 219 | Args: 220 | documents: Text document(s) to add 221 | ids: Document IDs (auto-generated if not provided) 222 | metadatas: Optional metadata for documents 223 | batch_size: Batch size for embedding generation 224 | show_progress_bar: Whether to show progress bar 225 | 226 | Returns: 227 | Dictionary with 'ids', 'embeddings', and 'documents' keys 228 | """ 229 | if not self._text_embedder: 230 | raise RuntimeError("Text embeddings not enabled. Set embedding_model during collection creation.") 231 | 232 | # Ensure documents is a list 233 | if isinstance(documents, str): 234 | documents = [documents] 235 | 236 | if not documents: 237 | raise ValueError("No documents provided") 238 | 239 | num_docs = len(documents) 240 | 241 | # Handle IDs 242 | if ids is None: 243 | # Auto-generate string IDs 244 | ids = [f"doc_{self._next_id + i}" for i in range(num_docs)] 245 | elif isinstance(ids, str): 246 | ids = [ids] 247 | 248 | if len(ids) != num_docs: 249 | raise ValueError("Number of IDs must match number of documents") 250 | 251 | # Handle metadata 252 | if metadatas is None: 253 | metadatas = [{} for _ in range(num_docs)] 254 | elif isinstance(metadatas, dict): 255 | metadatas = [metadatas for _ in range(num_docs)] 256 | 257 | if len(metadatas) != num_docs: 258 | raise ValueError("Number of metadatas must match number of documents") 259 | 260 | # Generate embeddings 261 | logger.info(f"Generating embeddings for {num_docs} documents...") 262 | embeddings = self._text_embedder.embed_texts( 263 | documents, 264 | batch_size=batch_size, 265 | show_progress_bar=show_progress_bar 266 | ) 267 | 268 | # Store documents 269 | for doc_id, doc_text in zip(ids, documents): 270 | self._documents[doc_id] = doc_text 271 | 272 | # Insert vectors into collection 273 | vector_ids = self.insert( 274 | vectors=embeddings, 275 | metadata=metadatas, 276 | ids=ids 277 | ) 278 | 279 | # Update stats 280 | self._stats["text_documents"] += num_docs 281 | 282 | # Prepare result 283 | result = { 284 | "ids": ids, 285 | "embeddings": embeddings, 286 | "documents": documents, 287 | "metadatas": metadatas, 288 | "vector_ids": vector_ids 289 | } 290 | 291 | logger.info(f"Added {num_docs} text documents with IDs: {ids}") 292 | return result 293 | 294 | def add( 295 | self, 296 | ids=None, 297 | documents=None, 298 | metadatas=None, 299 | embeddings=None 300 | ): 301 | """ 302 | ChromaDB-compatible add method for text documents. 
303 | 304 | Args: 305 | ids: Document IDs (auto-generated if not provided) 306 | documents: Text documents to add 307 | metadatas: Optional metadata for documents 308 | embeddings: Pre-computed embeddings (optional) 309 | 310 | Returns: 311 | Dictionary with 'ids' and 'embeddings' keys 312 | """ 313 | if documents is not None: 314 | # Use text document processing 315 | return self.add_text_documents(documents, ids, metadatas) 316 | elif embeddings is not None: 317 | # Use pre-computed embeddings 318 | if isinstance(embeddings, list) and isinstance(embeddings[0], (int, float)): 319 | embeddings = [embeddings] 320 | embeddings = np.array(embeddings, dtype=np.float32) 321 | 322 | # Handle IDs 323 | if ids is None: 324 | ids = [f"vec_{self._next_id + i}" for i in range(len(embeddings))] 325 | elif isinstance(ids, str): 326 | ids = [ids] 327 | 328 | # Handle metadata 329 | if metadatas is None: 330 | metadatas = [{} for _ in range(len(embeddings))] 331 | elif isinstance(metadatas, dict): 332 | metadatas = [metadatas for _ in range(len(embeddings))] 333 | 334 | # Insert vectors 335 | vector_ids = self.insert(vectors=embeddings, metadata=metadatas, ids=ids) 336 | 337 | return { 338 | "ids": ids, 339 | "embeddings": embeddings, 340 | "vector_ids": vector_ids 341 | } 342 | else: 343 | raise ValueError("Either 'documents' or 'embeddings' must be provided") 344 | 345 | def get_document(self, doc_id: str): 346 | """Get a text document by ID.""" 347 | return self._documents.get(doc_id) 348 | 349 | def list_documents(self): 350 | """List all text document IDs.""" 351 | return list(self._documents.keys()) 352 | 353 | def delete_document(self, doc_id: str) -> bool: 354 | """Delete a text document by ID.""" 355 | if doc_id in self._documents: 356 | del self._documents[doc_id] 357 | # Also delete the corresponding vector if it exists 358 | if doc_id in self._vectors: 359 | del self._vectors[doc_id] 360 | if doc_id in self._metadata: 361 | del self._metadata[doc_id] 362 | logger.info(f"Deleted document: {doc_id}") 363 | return True 364 | return False 365 | 366 | def clear_documents(self): 367 | """Clear all text documents and vectors.""" 368 | self._documents.clear() 369 | self._vectors.clear() 370 | self._metadata.clear() 371 | self._next_id = 0 372 | self._index_built = False 373 | self._index_needs_rebuild = False 374 | logger.info("Cleared all documents and vectors") 375 | 376 | def get_collection_info(self): 377 | """Get comprehensive information about the collection.""" 378 | return { 379 | "name": self.name, 380 | "dimension": self.dimension, 381 | "num_vectors": len(self._vectors), 382 | "num_documents": len(self._documents), 383 | "index_type": self.index_type, 384 | "distance_metric": self.distance_metric, 385 | "index_built": self._index_built, 386 | "text_embeddings_enabled": self._text_embedder is not None, 387 | "embedding_model": self._text_embedder.model_name if self._text_embedder else None, 388 | "stats": self._stats.copy() 389 | } 390 | 391 | def change_embedding_model(self, model_name: str, **kwargs): 392 | """ 393 | Change the text embedding model. 
394 | 395 | Args: 396 | model_name: New sentence-transformers model name 397 | **kwargs: Additional arguments for TextEmbedder 398 | """ 399 | if not self._text_embedder: 400 | raise RuntimeError("Text embeddings not enabled") 401 | 402 | old_dimension = self.dimension 403 | self._text_embedder.change_model(model_name, **kwargs) 404 | 405 | # Update collection dimension if it changed 406 | if self._text_embedder.dimension != old_dimension: 407 | logger.warning(f"Embedding model dimension changed from {old_dimension} to {self._text_embedder.dimension}") 408 | self.dimension = self._text_embedder.dimension 409 | # Rebuild index with new dimension 410 | self._index_needs_rebuild = True 411 | 412 | logger.info(f"Embedding model changed to: {model_name}") 413 | 414 | def search( 415 | self, 416 | query_vector, 417 | k: int = 10, 418 | filter=None, 419 | include_metadata: bool = False 420 | ): 421 | """ 422 | Search for similar vectors. 423 | 424 | Args: 425 | query_vector: Query vector 426 | k: Number of results to return 427 | filter: Optional metadata filter 428 | include_metadata: Whether to include metadata in results 429 | 430 | Returns: 431 | List of (id, distance, metadata) tuples 432 | """ 433 | # Ensure index is built 434 | if not self._index_built or self._index_needs_rebuild: 435 | self._build_index() 436 | 437 | # Validate query vector 438 | if query_vector.shape[0] != self.dimension: 439 | raise ValueError(f"Query vector dimension {query_vector.shape[0]} does not match collection dimension {self.dimension}") 440 | 441 | # Search using index 442 | start_time = time.time() 443 | results = self._index.search(query_vector, k) 444 | search_time = time.time() - start_time 445 | 446 | # Apply filters if specified 447 | if filter: 448 | results = self._apply_filter(results, filter) 449 | 450 | # Format results 451 | formatted_results = [] 452 | for vector_id, distance in results: 453 | metadata = self._metadata.get(vector_id) if include_metadata else None 454 | formatted_results.append((vector_id, distance, metadata)) 455 | 456 | # Update stats 457 | self._stats["searches"] += 1 458 | 459 | logger.debug(f"Search completed in {search_time:.4f}s, found {len(formatted_results)} results") 460 | 461 | return formatted_results 462 | 463 | def search_text( 464 | self, 465 | query_text: str, 466 | k: int = 10, 467 | filter=None, 468 | include_metadata: bool = False 469 | ): 470 | """ 471 | Search for similar documents using text query. 472 | 473 | Args: 474 | query_text: Text query to search for 475 | k: Number of results to return 476 | filter: Optional metadata filter 477 | include_metadata: Whether to include metadata in results 478 | 479 | Returns: 480 | List of (id, distance, metadata) tuples 481 | """ 482 | if not self._text_embedder: 483 | raise RuntimeError("Text embeddings not enabled") 484 | 485 | # Generate embedding for query text 486 | query_embedding = self._text_embedder.embed_texts(query_text)[0] 487 | 488 | # Search using the embedding 489 | return self.search( 490 | query_vector=query_embedding, 491 | k=k, 492 | filter=filter, 493 | include_metadata=include_metadata 494 | ) 495 | 496 | def search_batch( 497 | self, 498 | query_vectors, 499 | k: int = 10, 500 | filter=None, 501 | include_metadata: bool = False 502 | ): 503 | """ 504 | Batch search for similar vectors. 
505 | 506 | Args: 507 | query_vectors: Query vectors 508 | k: Number of results per query 509 | filter: Optional metadata filter 510 | include_metadata: Whether to include metadata in results 511 | 512 | Returns: 513 | List of result lists for each query 514 | """ 515 | # Ensure index is built 516 | if not self._index_built or self._index_needs_rebuild: 517 | self._build_index() 518 | 519 | # Validate query vectors 520 | if query_vectors.ndim == 1: 521 | query_vectors = query_vectors.reshape(1, -1) 522 | 523 | if query_vectors.shape[1] != self.dimension: 524 | raise ValueError(f"Query vector dimension {query_vectors.shape[1]} does not match collection dimension {self.dimension}") 525 | 526 | # Batch search using index 527 | start_time = time.time() 528 | batch_results = self._index.search_batch(query_vectors, k) 529 | search_time = time.time() - start_time 530 | 531 | # Apply filters and format results 532 | formatted_batch_results = [] 533 | for results in batch_results: 534 | if filter: 535 | results = self._apply_filter(results, filter) 536 | 537 | formatted_results = [] 538 | for vector_id, distance in results: 539 | metadata = self._metadata.get(vector_id) if include_metadata else None 540 | formatted_results.append((vector_id, distance, metadata)) 541 | 542 | formatted_batch_results.append(formatted_results) 543 | 544 | # Update stats 545 | self._stats["searches"] += 1 546 | 547 | logger.debug(f"Batch search completed in {search_time:.4f}s for {len(query_vectors)} queries") 548 | 549 | return formatted_batch_results 550 | 551 | def search_text_batch( 552 | self, 553 | query_texts, 554 | k: int = 10, 555 | filter=None, 556 | include_metadata: bool = False, 557 | batch_size: int = 32 558 | ): 559 | """ 560 | Batch search for similar documents using text queries. 561 | 562 | Args: 563 | query_texts: List of text queries 564 | k: Number of results per query 565 | filter: Optional metadata filter 566 | include_metadata: Whether to include metadata in results 567 | batch_size: Batch size for embedding generation 568 | 569 | Returns: 570 | List of result lists for each query 571 | """ 572 | if not self._text_embedder: 573 | raise RuntimeError("Text embeddings not enabled") 574 | 575 | # Generate embeddings for all query texts 576 | query_embeddings = self._text_embedder.embed_texts( 577 | query_texts, 578 | batch_size=batch_size 579 | ) 580 | 581 | # Search using the embeddings 582 | return self.search_batch( 583 | query_vectors=query_embeddings, 584 | k=k, 585 | filter=filter, 586 | include_metadata=include_metadata 587 | ) 588 | 589 | def _apply_filter(self, results, filter): 590 | """ 591 | Apply metadata filter to search results. 
592 | 593 | Args: 594 | results: Search results (id, distance) tuples 595 | filter: Filter criteria 596 | 597 | Returns: 598 | Filtered results 599 | """ 600 | if not self._query_engine: 601 | logger.warning("Query engine not available, returning unfiltered results") 602 | return results 603 | 604 | # Apply filter using query engine 605 | filtered_results = [] 606 | for vector_id, distance in results: 607 | metadata = self._metadata.get(vector_id, {}) 608 | if self._query_engine.evaluate(metadata, filter): 609 | filtered_results.append((vector_id, distance)) 610 | 611 | return filtered_results 612 | 613 | def _build_index(self) -> None: 614 | """Build or rebuild the vector index.""" 615 | if not self._vectors: 616 | logger.warning("No vectors to index") 617 | return 618 | 619 | start_time = time.time() 620 | 621 | # Convert vectors to array 622 | vector_ids = list(self._vectors.keys()) 623 | vectors_array = np.array([self._vectors[vid] for vid in vector_ids], dtype=np.float32) 624 | 625 | # Build index 626 | self._index.build(vectors_array, vector_ids) 627 | 628 | # Update status 629 | self._index_built = True 630 | self._index_needs_rebuild = False 631 | 632 | build_time = time.time() - start_time 633 | 634 | # Update stats 635 | self._stats["index_builds"] += 1 636 | self._stats["last_index_build"] = time.time() 637 | 638 | logger.info(f"Index built for {len(vectors_array)} vectors in {build_time:.4f}s") 639 | 640 | def update(self, id: int, vector, metadata=None) -> None: 641 | """ 642 | Update a vector. 643 | 644 | Args: 645 | id: Vector ID to update 646 | vector: New vector 647 | metadata: New metadata 648 | """ 649 | if id not in self._vectors: 650 | raise ValueError(f"Vector ID {id} does not exist") 651 | 652 | # Validate vector dimension 653 | if vector.shape[0] != self.dimension: 654 | raise ValueError(f"Vector dimension {vector.shape[0]} does not match collection dimension {self.dimension}") 655 | 656 | # Update vector and metadata 657 | self._vectors[id] = vector.copy() 658 | if metadata is not None: 659 | self._metadata[id] = metadata.copy() 660 | 661 | # Mark index for rebuild 662 | self._index_needs_rebuild = True 663 | 664 | # Update stats 665 | self._stats["updates"] += 1 666 | 667 | logger.debug(f"Updated vector {id} in collection '{self.name}'") 668 | 669 | def delete(self, id: int) -> None: 670 | """ 671 | Delete a vector. 672 | 673 | Args: 674 | id: Vector ID to delete 675 | """ 676 | if id not in self._vectors: 677 | raise ValueError(f"Vector ID {id} does not exist") 678 | 679 | # Remove vector and metadata 680 | del self._vectors[id] 681 | if id in self._metadata: 682 | del self._metadata[id] 683 | 684 | # Mark index for rebuild 685 | self._index_needs_rebuild = True 686 | 687 | # Update stats 688 | self._stats["deletes"] += 1 689 | 690 | logger.debug(f"Deleted vector {id} from collection '{self.name}'") 691 | 692 | def delete_batch(self, ids) -> None: 693 | """ 694 | Batch delete vectors. 695 | 696 | Args: 697 | ids: List of vector IDs to delete 698 | """ 699 | for vector_id in ids: 700 | self.delete(vector_id) 701 | 702 | def get_vector(self, id: int, include_metadata: bool = False): 703 | """ 704 | Get a vector by ID. 
705 | 706 | Args: 707 | id: Vector ID 708 | include_metadata: Whether to include metadata 709 | 710 | Returns: 711 | Vector or (vector, metadata) tuple 712 | """ 713 | if id not in self._vectors: 714 | raise ValueError(f"Vector ID {id} does not exist") 715 | 716 | vector = self._vectors[id] 717 | if include_metadata: 718 | metadata = self._metadata.get(id) 719 | return vector, metadata 720 | else: 721 | return vector 722 | 723 | def count(self) -> int: 724 | """Get total number of vectors in the collection.""" 725 | return len(self._vectors) 726 | 727 | def get_stats(self): 728 | """Get collection statistics.""" 729 | stats = self._stats.copy() 730 | stats["vector_count"] = len(self._vectors) 731 | stats["metadata_count"] = len(self._metadata) 732 | stats["document_count"] = len(self._documents) 733 | stats["index_built"] = self._index_built 734 | stats["index_needs_rebuild"] = self._index_needs_rebuild 735 | return stats 736 | 737 | def optimize_index(self) -> None: 738 | """Optimize the collection's index.""" 739 | if not self._index_built: 740 | logger.warning("Index not built yet") 741 | return 742 | 743 | start_time = time.time() 744 | self._index.optimize() 745 | optimize_time = time.time() - start_time 746 | 747 | logger.info(f"Index optimization completed in {optimize_time:.4f}s") 748 | 749 | def clear(self) -> None: 750 | """Clear all vectors from the collection.""" 751 | self._vectors.clear() 752 | self._metadata.clear() 753 | self._documents.clear() 754 | self._next_id = 0 755 | self._index_built = False 756 | self._index_needs_rebuild = False 757 | 758 | logger.info(f"Collection '{self.name}' cleared") 759 | 760 | def __len__(self) -> int: 761 | """Return number of vectors in the collection.""" 762 | return len(self._vectors) 763 | 764 | def __contains__(self, id: int) -> bool: 765 | """Check if a vector ID exists in the collection.""" 766 | return id in self._vectors 767 | --------------------------------------------------------------------------------
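A minimal usage sketch of the Collection API defined in octanedb/collection.py above, assuming sentence-transformers and the all-MiniLM-L6-v2 model are installed; the collection name, documents, and metadata values are illustrative only and do not appear in the repository.

    # Hypothetical example (not part of the package source): exercises the
    # text-document path of the Collection class shown above.
    from octanedb.collection import Collection

    # Create a text-enabled collection. If the embedding model's output size
    # differs from `dimension`, the collection adopts the model's dimension.
    col = Collection(
        name="articles",
        dimension=384,
        embedding_model="all-MiniLM-L6-v2",  # assumes sentence-transformers is available
    )

    # Add text documents; IDs are auto-generated and embeddings are computed.
    col.add(
        documents=["Vectors encode meaning.", "HNSW keeps search fast."],
        metadatas=[{"topic": "ml"}, {"topic": "search"}],
    )

    # Query by text; the HNSW index is built lazily on the first search.
    for vec_id, distance, meta in col.search_text("semantic search", k=2, include_metadata=True):
        print(vec_id, distance, meta)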