├── raggy.png ├── raggy ├── cli │ ├── __init__.py │ ├── base.py │ └── factory.py ├── config │ ├── __init__.py │ ├── cache.py │ ├── loader.py │ ├── constants.py │ └── raggy_config.py ├── setup │ ├── __init__.py │ └── dependencies.py ├── query │ ├── __init__.py │ └── processor.py ├── scoring │ ├── __init__.py │ ├── normalization.py │ └── bm25.py ├── utils │ ├── __init__.py │ ├── patterns.py │ ├── symbols.py │ ├── security.py │ ├── logging.py │ └── updates.py ├── embeddings │ ├── __init__.py │ ├── provider.py │ ├── factory.py │ ├── sentence_transformers_provider.py │ └── openai_provider.py ├── core │ ├── __init__.py │ ├── database.py │ ├── vector_store_factory.py │ └── database_interface.py └── __init__.py ├── .claude └── settings.local.json ├── requirements-dev.txt ├── .gitignore ├── LICENSE ├── .pre-commit-config.yaml ├── docs ├── artifacts │ └── QUALITY_VIOLATIONS.csv ├── configuration.md ├── setup-guide.md └── vector-databases.md ├── .raggy.json.example ├── CHANGELOG.md ├── pyproject.toml ├── .github └── workflows │ └── test.yml ├── raggy.py ├── tests ├── test_memory_api.py ├── test_bm25.py ├── conftest.py └── test_query_processor.py ├── README.md └── raggy_cli.py /raggy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dimitritholen/raggy/HEAD/raggy.png -------------------------------------------------------------------------------- /raggy/cli/__init__.py: -------------------------------------------------------------------------------- 1 | """Command-line interface for the RAG system.""" 2 | -------------------------------------------------------------------------------- /raggy/config/__init__.py: -------------------------------------------------------------------------------- 1 | """Configuration management and constants.""" 2 | -------------------------------------------------------------------------------- /raggy/setup/__init__.py: 
-------------------------------------------------------------------------------- 1 | """Environment setup and dependency management.""" 2 | -------------------------------------------------------------------------------- /raggy/query/__init__.py: -------------------------------------------------------------------------------- 1 | """Query processing and expansion functionality.""" 2 | -------------------------------------------------------------------------------- /raggy/scoring/__init__.py: -------------------------------------------------------------------------------- 1 | """Scoring and normalization functions for search results.""" 2 | -------------------------------------------------------------------------------- /raggy/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """Utility functions for logging, security, and other cross-cutting concerns.""" 2 | -------------------------------------------------------------------------------- /.claude/settings.local.json: -------------------------------------------------------------------------------- 1 | { 2 | "permissions": { 3 | "allow": [ 4 | "Bash(git add:*)", 5 | "Bash(git commit:*)", 6 | "Bash(python:*)", 7 | "mcp__sequential-thinking__sequentialthinking", 8 | "mcp__ucpl-compress__compress_code_context", 9 | "Bash(fd:*)", 10 | "Bash(ruff check:*)", 11 | "Bash(mypy:*)", 12 | "Bash(pytest:*)" 13 | ], 14 | "deny": [], 15 | "ask": [] 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /raggy/cli/base.py: -------------------------------------------------------------------------------- 1 | """Base command interface for CLI.""" 2 | 3 | from typing import Any, Optional 4 | 5 | 6 | class Command: 7 | """Base command interface.""" 8 | 9 | def execute(self, args: Any, rag: Optional[Any] = None) -> None: 10 | """Execute the command. 
11 | 12 | Args: 13 | args: Command line arguments 14 | rag: UniversalRAG instance (optional for some commands) 15 | 16 | """ 17 | raise NotImplementedError 18 | -------------------------------------------------------------------------------- /raggy/utils/patterns.py: -------------------------------------------------------------------------------- 1 | """Pre-compiled regex patterns for performance.""" 2 | 3 | import re 4 | 5 | # Text processing patterns 6 | WORD_PATTERN = re.compile(r"\b\w+\b") 7 | NEGATIVE_TERM_PATTERN = re.compile(r"-\w+") 8 | AND_TERM_PATTERN = re.compile(r"\w+(?=\s+AND)", re.IGNORECASE) 9 | QUOTED_PHRASE_PATTERN = re.compile(r'"([^"]+)"') 10 | 11 | # Document structure patterns 12 | HEADER_PATTERN = re.compile(r"(^#{1,6}\s+.*$)", re.MULTILINE) 13 | SENTENCE_BOUNDARY_PATTERN = re.compile(r"[.!?\n]") 14 | -------------------------------------------------------------------------------- /raggy/embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | """Embedding providers for Raggy. 2 | 3 | This module provides a pluggable embedding provider system supporting 4 | both local models (sentence-transformers) and cloud APIs (OpenAI). 
5 | """ 6 | 7 | from .factory import create_embedding_provider 8 | from .openai_provider import OpenAIProvider 9 | from .provider import EmbeddingProvider 10 | from .sentence_transformers_provider import SentenceTransformersProvider 11 | 12 | __all__ = [ 13 | "EmbeddingProvider", 14 | "SentenceTransformersProvider", 15 | "OpenAIProvider", 16 | "create_embedding_provider", 17 | ] 18 | -------------------------------------------------------------------------------- /raggy/core/__init__.py: -------------------------------------------------------------------------------- 1 | """Core business logic for the RAG system.""" 2 | 3 | from .chromadb_adapter import ChromaCollection, ChromaDBAdapter 4 | from .database import DatabaseManager 5 | from .database_interface import Collection, VectorDatabase 6 | from .document import DocumentProcessor 7 | from .rag import UniversalRAG 8 | from .search import SearchEngine 9 | 10 | __all__ = [ 11 | # Main components 12 | "UniversalRAG", 13 | "DatabaseManager", 14 | "DocumentProcessor", 15 | "SearchEngine", 16 | # Database interfaces 17 | "VectorDatabase", 18 | "Collection", 19 | # Database implementations 20 | "ChromaDBAdapter", 21 | "ChromaCollection", 22 | ] 23 | -------------------------------------------------------------------------------- /raggy/utils/symbols.py: -------------------------------------------------------------------------------- 1 | """Cross-platform emoji/symbol support.""" 2 | 3 | from typing import Dict 4 | 5 | 6 | def get_symbols() -> Dict[str, str]: 7 | """Get appropriate symbols based on platform/terminal support. 
8 | 9 | Returns: 10 | Dict[str, str]: Dictionary of symbol names to their display representations 11 | 12 | """ 13 | try: 14 | # Test if terminal supports unicode 15 | test = "🔍" 16 | print(test, end="") 17 | print("\b \b", end="") # backspace and clear 18 | return { 19 | "search": "🔍", 20 | "found": "📋", 21 | "success": "✅", 22 | "bye": "👋" 23 | } 24 | except UnicodeEncodeError: 25 | return { 26 | "search": "[Search]", 27 | "found": "[Found]", 28 | "success": "[Success]", 29 | "bye": "[Bye]", 30 | } 31 | 32 | 33 | # Initialize symbols once 34 | SYMBOLS = get_symbols() 35 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # Development dependencies for raggy 2 | # Core dependencies (also needed for production) 3 | chromadb>=0.4.0 4 | sentence-transformers>=2.2.0 5 | PyPDF2>=3.0.0 6 | python-docx>=1.0.0 7 | 8 | # Optional dependencies for better functionality 9 | PyYAML>=6.0 10 | python-magic-bin>=0.4.14;platform_system=="Windows" 11 | python-magic;platform_system!="Windows" 12 | 13 | # Testing 14 | pytest>=7.0.0 15 | pytest-cov>=4.0.0 16 | pytest-mock>=3.10.0 17 | pytest-xdist>=3.0.0 # parallel testing 18 | 19 | # Code quality 20 | ruff>=0.1.0 # linting and formatting 21 | mypy>=1.5.0 # type checking 22 | types-PyYAML # type stubs for PyYAML 23 | 24 | # Security 25 | bandit>=1.7.0 # security linting 26 | safety>=2.3.0 # dependency vulnerability scanning 27 | 28 | # Performance testing 29 | pytest-benchmark>=4.0.0 30 | 31 | # Documentation (optional) 32 | sphinx>=7.0.0 33 | sphinx-rtd-theme>=1.3.0 34 | myst-parser>=2.0.0 # for markdown in docs -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.so 6 | .Python 7 | build/ 8 | develop-eggs/ 9 | 
dist/ 10 | downloads/ 11 | eggs/ 12 | .eggs/ 13 | lib/ 14 | lib64/ 15 | parts/ 16 | sdist/ 17 | var/ 18 | wheels/ 19 | *.egg-info/ 20 | .installed.cfg 21 | *.egg 22 | MANIFEST 23 | 24 | # Virtual environments 25 | .env 26 | .venv 27 | env/ 28 | venv/ 29 | ENV/ 30 | env.bak/ 31 | venv.bak/ 32 | 33 | # IDEs 34 | .vscode/ 35 | .idea/ 36 | *.swp 37 | *.swo 38 | *~ 39 | 40 | # Testing 41 | .coverage 42 | .pytest_cache/ 43 | .tox/ 44 | htmlcov/ 45 | .coverage.* 46 | coverage.xml 47 | *.cover 48 | .hypothesis/ 49 | 50 | # Raggy specific 51 | vectordb/ 52 | docs/ 53 | raggy_config.yaml 54 | raggy_config_example.yaml 55 | .raggy_deps_cache.json 56 | .raggy.json 57 | *.backup 58 | 59 | # OS specific 60 | .DS_Store 61 | .DS_Store? 62 | ._* 63 | .Spotlight-V100 64 | .Trashes 65 | ehthumbs.db 66 | Thumbs.db 67 | 68 | # Logs 69 | *.log 70 | 71 | # Temporary files 72 | *.tmp 73 | *.temp 74 | *.db 75 | *.sqlite3 76 | *.db 77 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Dimitri Tholen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # Pre-commit hooks for raggy 2 | repos: 3 | - repo: https://github.com/pre-commit/pre-commit-hooks 4 | rev: v4.5.0 5 | hooks: 6 | - id: trailing-whitespace 7 | - id: end-of-file-fixer 8 | - id: check-yaml 9 | - id: check-json 10 | - id: check-toml 11 | - id: check-merge-conflict 12 | - id: check-added-large-files 13 | args: ['--maxkb=1000'] 14 | - id: debug-statements 15 | - id: check-docstring-first 16 | 17 | - repo: https://github.com/astral-sh/ruff-pre-commit 18 | rev: v0.1.9 19 | hooks: 20 | - id: ruff 21 | args: [--fix] 22 | - id: ruff-format 23 | 24 | - repo: https://github.com/pre-commit/mirrors-mypy 25 | rev: v1.8.0 26 | hooks: 27 | - id: mypy 28 | args: [--ignore-missing-imports] 29 | additional_dependencies: [types-PyYAML] 30 | 31 | - repo: https://github.com/PyCQA/bandit 32 | rev: '1.7.5' 33 | hooks: 34 | - id: bandit 35 | args: ['-c', 'pyproject.toml'] 36 | additional_dependencies: ['bandit[toml]'] 37 | 38 | - repo: local 39 | hooks: 40 | - id: raggy-self-test 41 | name: Raggy Self Test 42 | entry: python raggy.py test 43 | language: system 44 | files: raggy\.py$ 45 | pass_filenames: false 46 | 47 | - id: pytest 48 | name: Run tests 49 | entry: pytest 50 | language: system 51 | files: \.(py)$ 52 | args: [tests/, --tb=short] 53 | pass_filenames: false -------------------------------------------------------------------------------- /raggy/cli/factory.py: -------------------------------------------------------------------------------- 1 | """Factory for creating command instances.""" 2 | 3 
| from .base import Command 4 | from .commands import ( 5 | BuildCommand, 6 | DiagnoseCommand, 7 | ForgetCommand, 8 | InitCommand, 9 | InteractiveCommand, 10 | OptimizeCommand, 11 | RecallCommand, 12 | RememberCommand, 13 | SearchCommand, 14 | StatusCommand, 15 | TestCommand, 16 | ValidateCommand, 17 | ) 18 | 19 | 20 | class CommandFactory: 21 | """Factory for creating command instances.""" 22 | 23 | _commands = { 24 | "init": InitCommand, 25 | "build": BuildCommand, 26 | "rebuild": BuildCommand, 27 | "search": SearchCommand, 28 | "interactive": InteractiveCommand, 29 | "status": StatusCommand, 30 | "optimize": OptimizeCommand, 31 | "test": TestCommand, 32 | "diagnose": DiagnoseCommand, 33 | "validate": ValidateCommand, 34 | "remember": RememberCommand, 35 | "recall": RecallCommand, 36 | "forget": ForgetCommand, 37 | } 38 | 39 | @classmethod 40 | def create_command(cls, command_name: str) -> Command: 41 | """Create a command instance. 42 | 43 | Args: 44 | command_name: Name of the command to create 45 | 46 | Returns: 47 | Command: Command instance 48 | 49 | Raises: 50 | ValueError: If command name is unknown 51 | 52 | """ 53 | command_class = cls._commands.get(command_name) 54 | if command_class is None: 55 | raise ValueError(f"Unknown command: {command_name}") 56 | return command_class() 57 | -------------------------------------------------------------------------------- /raggy/config/cache.py: -------------------------------------------------------------------------------- 1 | """Cache management for dependencies and other temporary data.""" 2 | 3 | import json 4 | from pathlib import Path 5 | from typing import Any, Dict 6 | 7 | from ..utils.logging import log_warning 8 | 9 | 10 | def get_cache_file() -> Path: 11 | """Get path for dependency cache file. 
12 | 13 | Returns: 14 | Path: Path to the cache file 15 | 16 | """ 17 | return Path.cwd() / ".raggy_deps_cache.json" 18 | 19 | 20 | def load_deps_cache() -> Dict[str, Any]: 21 | """Load dependency cache from file. 22 | 23 | Returns: 24 | Dict[str, Any]: Cached dependency information or empty dict if not found 25 | 26 | """ 27 | cache_file = get_cache_file() 28 | if cache_file.exists(): 29 | try: 30 | with open(cache_file) as f: 31 | return json.load(f) 32 | except (FileNotFoundError, json.JSONDecodeError, PermissionError) as e: 33 | # Cache loading is optional - use empty cache if unavailable 34 | log_warning( 35 | f"Could not load dependency cache from {cache_file.name}, using empty cache", 36 | e, 37 | quiet=True # Debug-level issue, don't show to users 38 | ) 39 | return {} 40 | 41 | 42 | def save_deps_cache(cache: Dict[str, Any]) -> None: 43 | """Save dependency cache to file. 44 | 45 | Args: 46 | cache: Cache dictionary to save 47 | 48 | """ 49 | cache_file = get_cache_file() 50 | try: 51 | with open(cache_file, "w") as f: 52 | json.dump(cache, f) 53 | except (OSError, PermissionError) as e: 54 | # Cache saving is optional - continue without cache if write fails 55 | log_warning( 56 | f"Could not save dependency cache to {cache_file.name}, cache will not persist", 57 | e, 58 | quiet=True # Debug-level issue, don't show to users 59 | ) 60 | -------------------------------------------------------------------------------- /raggy/utils/security.py: -------------------------------------------------------------------------------- 1 | """Security utility functions for path validation and error sanitization.""" 2 | 3 | import re 4 | from pathlib import Path 5 | from typing import Optional 6 | 7 | # Pre-compiled regex patterns for security scanning 8 | WINDOWS_PATH_PATTERN = re.compile(r'[A-Za-z]:[\\\/][^\\\/\s]*[\\\/]') 9 | UNIX_PATH_PATTERN = re.compile(r'\/[^\/\s]*\/') 10 | FILE_URL_PATTERN = re.compile(r'\bfile:\/\/[^\s]*') 11 | 12 | 13 | def 
validate_path(file_path: Path, base_path: Optional[Path] = None) -> bool: 14 | """Validate file path to prevent directory traversal attacks. 15 | 16 | Args: 17 | file_path: The path to validate 18 | base_path: The base directory to check against (defaults to current working directory) 19 | 20 | Returns: 21 | bool: True if the path is safe (within base directory), False otherwise 22 | 23 | """ 24 | try: 25 | # Resolve the path to get absolute path 26 | resolved_path = file_path.resolve() 27 | 28 | if base_path is None: 29 | base_path = Path.cwd() 30 | else: 31 | base_path = base_path.resolve() 32 | 33 | # Check if the resolved path is within the base directory 34 | try: 35 | resolved_path.relative_to(base_path) 36 | return True 37 | except ValueError: 38 | # Path is outside the base directory 39 | return False 40 | except (OSError, ValueError): 41 | return False 42 | 43 | 44 | def sanitize_error_message(error_msg: str) -> str: 45 | """Sanitize error messages to prevent information leakage. 46 | 47 | Args: 48 | error_msg: The error message to sanitize 49 | 50 | Returns: 51 | str: Sanitized error message with sensitive paths removed 52 | 53 | """ 54 | # Remove potentially sensitive path information using pre-compiled patterns 55 | sanitized = WINDOWS_PATH_PATTERN.sub('', error_msg) # Windows paths 56 | sanitized = UNIX_PATH_PATTERN.sub('/', sanitized) # Unix paths 57 | return FILE_URL_PATTERN.sub('[FILE_PATH]', sanitized) 58 | -------------------------------------------------------------------------------- /raggy/embeddings/provider.py: -------------------------------------------------------------------------------- 1 | """Abstract interface for embedding providers. 2 | 3 | This module defines the standard interface that all embedding providers 4 | must implement, allowing for pluggable local and cloud embedding models. 
5 | """ 6 | 7 | from abc import ABC, abstractmethod 8 | from typing import List, Union 9 | 10 | import numpy as np 11 | 12 | 13 | class EmbeddingProvider(ABC): 14 | """Abstract base class for embedding providers. 15 | 16 | All embedding providers (local models, OpenAI, etc.) must implement 17 | this interface to ensure compatibility with Raggy's RAG system. 18 | """ 19 | 20 | @abstractmethod 21 | def encode( 22 | self, 23 | texts: Union[str, List[str]], 24 | batch_size: int = 32, 25 | show_progress: bool = False, 26 | ) -> np.ndarray: 27 | """Encode text(s) into embeddings. 28 | 29 | Args: 30 | texts: Single text string or list of texts to encode 31 | batch_size: Batch size for processing (used by some providers) 32 | show_progress: Whether to show progress bar 33 | 34 | Returns: 35 | np.ndarray: Embeddings array of shape (num_texts, embedding_dim) 36 | For single text input, returns shape (1, embedding_dim) 37 | 38 | Raises: 39 | ValueError: If texts is empty or invalid 40 | RuntimeError: If encoding fails 41 | 42 | """ 43 | 44 | @abstractmethod 45 | def get_dimension(self) -> int: 46 | """Get the dimension of embeddings produced by this provider. 47 | 48 | Returns: 49 | int: Embedding dimension (e.g., 384, 1536, 3072) 50 | 51 | """ 52 | 53 | @abstractmethod 54 | def get_model_name(self) -> str: 55 | """Get the name/identifier of the embedding model. 
56 | 57 | Returns: 58 | str: Model name (e.g., "all-MiniLM-L6-v2", "text-embedding-3-small") 59 | 60 | """ 61 | 62 | def __repr__(self) -> str: 63 | """String representation of provider.""" 64 | return f"{self.__class__.__name__}(model={self.get_model_name()}, dim={self.get_dimension()})" 65 | -------------------------------------------------------------------------------- /raggy/config/loader.py: -------------------------------------------------------------------------------- 1 | """Configuration loading and management.""" 2 | 3 | from pathlib import Path 4 | from typing import Any, Dict, Optional 5 | 6 | from ..utils.logging import log_warning 7 | from .constants import DEFAULT_CONFIG 8 | 9 | 10 | def load_config(config_path: Optional[str] = None) -> Dict[str, Any]: 11 | """Load optional configuration file. 12 | 13 | Args: 14 | config_path: Optional path to configuration file (defaults to raggy_config.yaml) 15 | 16 | Returns: 17 | Dict[str, Any]: Merged configuration dictionary 18 | 19 | """ 20 | default_config = DEFAULT_CONFIG.copy() 21 | 22 | # Try to load config file 23 | config_file = Path(config_path or "raggy_config.yaml") 24 | if config_file.exists(): 25 | try: 26 | import yaml 27 | 28 | with open(config_file) as f: 29 | user_config = yaml.safe_load(f) 30 | 31 | # Merge with defaults 32 | _merge_configs(default_config, user_config) 33 | except ImportError: 34 | log_warning("PyYAML not installed, using default config", quiet=False) 35 | except (FileNotFoundError, PermissionError, OSError) as e: 36 | log_warning(f"Could not access config file {config_file}", e, quiet=False) 37 | except (AttributeError, TypeError, ValueError) as e: 38 | # Handle YAML parsing errors - yaml.YAMLError inherits from Exception 39 | # but we catch common parsing issues (invalid structure, types, values) 40 | log_warning(f"Invalid YAML format in {config_file}", e, quiet=False) 41 | 42 | return default_config 43 | 44 | 45 | def _merge_configs(default: Dict[str, Any], user: Dict[str, 
Any]) -> None: 46 | """Recursively merge user config into default config. 47 | 48 | Args: 49 | default: Default configuration dictionary (modified in place) 50 | user: User configuration dictionary to merge 51 | 52 | """ 53 | for key, value in user.items(): 54 | if ( 55 | key in default 56 | and isinstance(default[key], dict) 57 | and isinstance(value, dict) 58 | ): 59 | _merge_configs(default[key], value) 60 | else: 61 | default[key] = value 62 | -------------------------------------------------------------------------------- /raggy/utils/logging.py: -------------------------------------------------------------------------------- 1 | """Logging utility functions for consistent error and warning handling.""" 2 | 3 | from pathlib import Path 4 | from typing import Optional 5 | 6 | from .security import sanitize_error_message 7 | 8 | 9 | def log_error(message: str, error: Optional[Exception] = None, *, quiet: bool = False) -> None: 10 | """Centralized error logging with consistent formatting. 11 | 12 | Args: 13 | message: The error message to log 14 | error: Optional exception to include in the message 15 | quiet: If True, suppress output 16 | 17 | """ 18 | if quiet: 19 | return 20 | 21 | if error: 22 | sanitized_error = sanitize_error_message(str(error)) 23 | print(f"ERROR: {message}: {sanitized_error}") 24 | else: 25 | print(f"ERROR: {message}") 26 | 27 | 28 | def log_warning(message: str, error: Optional[Exception] = None, *, quiet: bool = False) -> None: 29 | """Centralized warning logging with consistent formatting. 
30 | 31 | Args: 32 | message: The warning message to log 33 | error: Optional exception to include in the message 34 | quiet: If True, suppress output 35 | 36 | """ 37 | if quiet: 38 | return 39 | 40 | if error: 41 | sanitized_error = sanitize_error_message(str(error)) 42 | print(f"Warning: {message}: {sanitized_error}") 43 | else: 44 | print(f"Warning: {message}") 45 | 46 | 47 | def handle_file_error(file_path: Path, operation: str, error: Exception, *, quiet: bool = False) -> None: 48 | """Standardized file operation error handling. 49 | 50 | Args: 51 | file_path: The path to the file that caused the error 52 | operation: The operation being performed (e.g., 'read', 'write') 53 | error: The exception that occurred 54 | quiet: If True, suppress output 55 | 56 | """ 57 | if isinstance(error, (FileNotFoundError, PermissionError)): 58 | log_error(f"Cannot {operation} {file_path.name} - {type(error).__name__}", quiet=quiet) 59 | elif isinstance(error, UnicodeDecodeError): 60 | log_error(f"Cannot {operation} {file_path.name} - encoding issue", quiet=quiet) 61 | else: 62 | log_error(f"Cannot {operation} {file_path.name}", error, quiet=quiet) 63 | -------------------------------------------------------------------------------- /raggy/config/constants.py: -------------------------------------------------------------------------------- 1 | """Configuration constants for the RAG system.""" 2 | 3 | from typing import Any, Dict 4 | 5 | # Version information 6 | __version__ = "2.0.0" 7 | 8 | # File reading constants 9 | CHUNK_READ_SIZE = 8192 # 8KB chunks for file reading 10 | MAX_CACHE_SIZE = 1000 # Maximum number of cached embeddings 11 | CACHE_TTL = 3600 # Cache time-to-live in seconds (1 hour) 12 | MAX_FILE_SIZE_MB = 100 # Maximum file size in MB 13 | SESSION_CACHE_HOURS = 24 # Hours before update check 14 | UPDATE_TIMEOUT_SECONDS = 2 # API timeout for update checks 15 | 16 | # Default chunking parameters 17 | DEFAULT_CHUNK_SIZE = 1000 18 | DEFAULT_CHUNK_OVERLAP = 200 19 | 
DEFAULT_RESULTS = 5 20 | DEFAULT_CONTEXT_CHARS = 200 21 | DEFAULT_HYBRID_WEIGHT = 0.7 22 | 23 | # Input validation ranges 24 | MIN_CHUNK_SIZE = 100 25 | MAX_CHUNK_SIZE = 10000 26 | MIN_CHUNK_OVERLAP = 0 27 | MIN_TOP_K = 1 28 | MAX_TOP_K = 100 29 | MAX_QUERY_LENGTH = 10000 30 | 31 | # File type constants 32 | SUPPORTED_EXTENSIONS = [".md", ".pdf", ".docx", ".txt"] 33 | GLOB_PATTERNS = ["**/*.md", "**/*.pdf", "**/*.docx", "**/*.txt"] 34 | 35 | # Model presets 36 | FAST_MODEL = "paraphrase-MiniLM-L3-v2" 37 | DEFAULT_MODEL = "all-MiniLM-L6-v2" 38 | MULTILINGUAL_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2" 39 | ACCURATE_MODEL = "all-mpnet-base-v2" 40 | 41 | # Default configuration structure 42 | DEFAULT_CONFIG: Dict[str, Any] = { 43 | "model": DEFAULT_MODEL, 44 | "chunk_size": DEFAULT_CHUNK_SIZE, 45 | "chunk_overlap": DEFAULT_CHUNK_OVERLAP, 46 | "default_results": DEFAULT_RESULTS, 47 | "context_chars": DEFAULT_CONTEXT_CHARS, 48 | "excluded_dirs": [ 49 | # Version control and dependencies 50 | ".git", "node_modules", ".venv", "venv", "__pycache__", 51 | # Build and distribution 52 | "dist", "build", "*.egg-info", 53 | # IDEs and editors 54 | ".idea", ".vscode", 55 | # Misc 56 | "chroma_db", "vectordb", ".chromadb", ".raggydb", 57 | ".pytest_cache", ".mypy_cache", ".ruff_cache", 58 | ], 59 | "supported_extensions": SUPPORTED_EXTENSIONS, 60 | "search": { 61 | "hybrid_weight": DEFAULT_HYBRID_WEIGHT, 62 | "expand_queries": False, 63 | "boost_exact": True, 64 | }, 65 | "updates": { 66 | "check_enabled": True, 67 | "github_repo": "dimitritholen/raggy", 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /raggy/__init__.py: -------------------------------------------------------------------------------- 1 | """Raggy - Universal RAG system for document search and retrieval. 
2 | 3 | This package provides: 4 | - UniversalRAG: Main RAG system for document search and retrieval 5 | - Memory: AI development memory system for context persistence 6 | - remember/recall: Convenience functions for quick memory operations 7 | 8 | Example: 9 | >>> from raggy import UniversalRAG, Memory 10 | >>> 11 | >>> # Document search 12 | >>> rag = UniversalRAG(docs_dir="./docs") 13 | >>> results = rag.search("machine learning algorithms") 14 | >>> 15 | >>> # Development memory 16 | >>> memory = Memory(db_dir="./vectordb") 17 | >>> mem_id = memory.add( 18 | ... "Decided to use ChromaDB for vector storage", 19 | ... memory_type="decision", 20 | ... tags=["architecture", "database"] 21 | ... ) 22 | >>> results = memory.search("database decisions") 23 | 24 | """ 25 | 26 | from raggy.cli.factory import CommandFactory 27 | from raggy.config.loader import load_config 28 | from raggy.core.database import DatabaseManager 29 | from raggy.core.document import DocumentProcessor 30 | from raggy.core.memory import Memory, recall, remember 31 | from raggy.core.rag import UniversalRAG 32 | from raggy.core.search import SearchEngine 33 | from raggy.query.processor import QueryProcessor 34 | from raggy.scoring.bm25 import BM25Scorer 35 | from raggy.scoring.normalization import ( 36 | interpret_score, 37 | normalize_cosine_distance, 38 | normalize_hybrid_score, 39 | ) 40 | from raggy.setup.dependencies import install_if_missing, setup_dependencies 41 | from raggy.setup.environment import setup_environment 42 | from raggy.utils.updates import check_for_updates 43 | 44 | __version__ = "2.0.0" 45 | 46 | __all__ = [ 47 | # Core RAG system 48 | "UniversalRAG", 49 | "SearchEngine", 50 | "DatabaseManager", 51 | "DocumentProcessor", 52 | # Memory system (new in 2.0) 53 | "Memory", 54 | "remember", 55 | "recall", 56 | # Scoring and normalization 57 | "normalize_cosine_distance", 58 | "normalize_hybrid_score", 59 | "interpret_score", 60 | "BM25Scorer", 61 | # Query processing 62 | 
"QueryProcessor", 63 | # CLI and configuration 64 | "CommandFactory", 65 | "load_config", 66 | # Setup utilities 67 | "setup_environment", 68 | "setup_dependencies", 69 | "install_if_missing", 70 | "check_for_updates", 71 | ] 72 | -------------------------------------------------------------------------------- /raggy/scoring/normalization.py: -------------------------------------------------------------------------------- 1 | """Score normalization functions for search results.""" 2 | 3 | from typing import Optional 4 | 5 | 6 | def normalize_cosine_distance(distance: float) -> float: 7 | """Normalize cosine distance (0-2 range) to similarity score (0-1 range). 8 | 9 | Args: 10 | distance: Cosine distance value (0-2 range, where 0 is identical) 11 | 12 | Returns: 13 | float: Normalized score (0-1 range, where 1 is perfect match) 14 | 15 | """ 16 | # Convert cosine distance (0-2) to similarity (0-1) 17 | # Distance of 0 = similarity of 1 (identical) 18 | # Distance of 2 = similarity of 0 (opposite) 19 | return max(0.0, min(1.0, 1.0 - (distance / 2.0))) 20 | 21 | 22 | def normalize_hybrid_score( 23 | semantic_score: float, 24 | keyword_score: float, 25 | weight: float = 0.7, 26 | semantic_boost: Optional[float] = None 27 | ) -> float: 28 | """Combine and normalize semantic and keyword scores. 
29 | 30 | Args: 31 | semantic_score: Normalized semantic similarity score (0-1) 32 | keyword_score: BM25 keyword score (unbounded) 33 | weight: Weight for semantic score (0-1), remainder goes to keyword 34 | semantic_boost: Optional boost factor for high semantic scores 35 | 36 | Returns: 37 | float: Combined normalized score (0-1) 38 | 39 | """ 40 | # Normalize BM25 score to 0-1 range (sigmoid-like transformation) 41 | # BM25 scores typically range from 0-20, we'll use a soft cap at 10 42 | normalized_keyword = min(1.0, keyword_score / 10.0) 43 | 44 | # Apply semantic boost if specified and semantic score is high 45 | if semantic_boost and semantic_score > 0.8: 46 | semantic_score = min(1.0, semantic_score * semantic_boost) 47 | 48 | # Weighted combination 49 | combined = (weight * semantic_score) + ((1 - weight) * normalized_keyword) 50 | 51 | return min(1.0, combined) # Ensure max score is 1.0 52 | 53 | 54 | def interpret_score(score: float) -> str: 55 | """Convert normalized score to human-readable interpretation. 
56 | 57 | Args: 58 | score: Normalized score (0-1 range) 59 | 60 | Returns: 61 | str: Human-readable score interpretation 62 | 63 | """ 64 | if score >= 0.9: 65 | return "Excellent" 66 | elif score >= 0.7: 67 | return "Good" 68 | elif score >= 0.5: 69 | return "Fair" 70 | elif score >= 0.3: 71 | return "Weak" 72 | else: 73 | return "Poor" 74 | -------------------------------------------------------------------------------- /docs/artifacts/QUALITY_VIOLATIONS.csv: -------------------------------------------------------------------------------- 1 | File,Line,Severity,Type,Rule,Description,Auto-Fix,Estimated Effort 2 | raggy/core/supabase_adapter.py,466,BLOCKING,Complexity,CC=12,SupabaseCollection.update exceeds complexity threshold,No,1-2 hours 3 | raggy/core/supabase_adapter.py,352,BLOCKING,Complexity,CC=10,SupabaseCollection.get at complexity threshold,No,1 hour 4 | raggy/core/pinecone_adapter.py,143,HIGH,Exception,Broad catch,Catching bare Exception instead of specific types,No,15 min 5 | raggy/core/supabase_adapter.py,71,HIGH,Exception,SIM105,Suppressible exception - use contextlib.suppress,Yes,Auto 6 | raggy/core/supabase_adapter.py,205,HIGH,Exception,Broad catch,Catching bare Exception instead of specific types,No,15 min 7 | raggy/core/pinecone_adapter.py,253,MEDIUM,Style,E501,Line too long (89 > 88 characters),No,2 min 8 | raggy/core/pinecone_adapter.py,428,MEDIUM,Complexity,CC=10,PineconeCollection.update at threshold,No,Monitor 9 | raggy/core/pinecone_adapter.py,166,MEDIUM,Complexity,CC=9,PineconeCollection.add near threshold,No,Monitor 10 | raggy/core/supabase_adapter.py,228,LOW,Complexity,CC=8,SupabaseCollection.add acceptable,No,Monitor 11 | raggy/core/supabase_adapter.py,432,LOW,Complexity,CC=8,SupabaseCollection.delete acceptable,No,Monitor 12 | raggy/config/raggy_config.py,107,LOW,Style,UP015,Redundant open mode 'r',Yes,Auto 13 | raggy/config/raggy_config.py,116,LOW,Style,RET504,Unnecessary assignment before return,No,2 min 14 | 
raggy/config/raggy_config.py,294,LOW,Docstring,D401,Imperative mood violation in __repr__,No,1 min 15 | raggy/core/vector_store_factory.py,3,LOW,Import,I001,Import block unsorted,Yes,Auto 16 | raggy/core/supabase_adapter.py,67,LOW,Style,F841,Unused variable 'result',Yes,Auto 17 | raggy/embeddings/__init__.py,7,LOW,Import,I001,Import block unsorted,Yes,Auto 18 | raggy/embeddings/factory.py,3,LOW,Import,I001,Import block unsorted,Yes,Auto 19 | raggy/embeddings/provider.py,42,LOW,Style,PIE790,Unnecessary pass statement,Yes,Auto 20 | raggy/embeddings/provider.py,51,LOW,Style,PIE790,Unnecessary pass statement,Yes,Auto 21 | raggy/embeddings/provider.py,60,LOW,Style,PIE790,Unnecessary pass statement,Yes,Auto 22 | raggy/embeddings/provider.py,63,LOW,Docstring,D401,Imperative mood violation in __repr__,No,1 min 23 | raggy/embeddings/openai_provider.py,117,LOW,Style,RET506,Unnecessary elif after raise,Yes,Auto 24 | raggy/embeddings/sentence_transformers_provider.py,83,LOW,Style,RET504,Unnecessary assignment before return,No,2 min 25 | Multiple files,Various,LOW,Docstring,D413,Missing blank line after docstring sections (51 occurrences),Yes,Auto 26 | -------------------------------------------------------------------------------- /.raggy.json.example: -------------------------------------------------------------------------------- 1 | { 2 | "_comment": "Raggy Configuration File - Copy to .raggy.json and customize", 3 | "_description": "This file shows all available configuration options for Raggy RAG system", 4 | 5 | "vectorStore": { 6 | "_comment": "Vector database configuration - choose provider and configure settings", 7 | "provider": "chromadb", 8 | 9 | "chromadb": { 10 | "_comment": "Local vector database using ChromaDB (default, no API key needed)", 11 | "path": "./vectordb" 12 | }, 13 | 14 | "pinecone": { 15 | "_comment": "Pinecone cloud vector database - requires API key and environment", 16 | "_install": "pip install raggy[pinecone]", 17 | "apiKey": 
"${PINECONE_API_KEY}", 18 | "environment": "us-east-1-aws", 19 | "indexName": "raggy-index", 20 | "dimension": 384 21 | }, 22 | 23 | "supabase": { 24 | "_comment": "Supabase (PostgreSQL + pgvector) - requires project URL and API key", 25 | "_install": "pip install raggy[supabase]", 26 | "url": "${SUPABASE_URL}", 27 | "apiKey": "${SUPABASE_ANON_KEY}", 28 | "dimension": 384 29 | } 30 | }, 31 | 32 | "embedding": { 33 | "_comment": "Embedding model configuration - choose provider and model", 34 | "provider": "sentence-transformers", 35 | 36 | "sentenceTransformers": { 37 | "_comment": "Local embedding models (default, no API key needed)", 38 | "_models": "all-MiniLM-L6-v2 (384-dim, fast), all-mpnet-base-v2 (768-dim, accurate)", 39 | "model": "all-MiniLM-L6-v2", 40 | "device": "cpu" 41 | }, 42 | 43 | "openai": { 44 | "_comment": "OpenAI embedding models - requires API key", 45 | "_install": "pip install raggy[openai]", 46 | "_models": "text-embedding-3-small (1536-dim), text-embedding-3-large (3072-dim)", 47 | "apiKey": "${OPENAI_API_KEY}", 48 | "model": "text-embedding-3-small" 49 | } 50 | }, 51 | 52 | "memory": { 53 | "_comment": "Memory categories configuration - customize or extend default categories", 54 | "categoriesMode": "append", 55 | 56 | "_modesDescription": { 57 | "append": "Use defaults + add custom categories - remove specified", 58 | "replace": "Ignore defaults, use only replacement categories", 59 | "custom": "Use only custom added categories (no defaults)" 60 | }, 61 | 62 | "categories": { 63 | "add": ["meeting", "research", "architecture-review"], 64 | "remove": ["error"], 65 | "replace": ["bug", "feature", "refactor", "docs", "test"] 66 | }, 67 | 68 | "_defaultCategories": ["decision", "solution", "pattern", "learning", "error", "note"] 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /raggy/embeddings/factory.py: -------------------------------------------------------------------------------- 1 | 
"""Factory for creating embedding providers based on configuration.""" 2 | 3 | from typing import Any, Dict 4 | 5 | from .openai_provider import OpenAIProvider 6 | from .provider import EmbeddingProvider 7 | from .sentence_transformers_provider import SentenceTransformersProvider 8 | 9 | 10 | def create_embedding_provider(config: Dict[str, Any]) -> EmbeddingProvider: 11 | """Create an embedding provider based on configuration. 12 | 13 | Args: 14 | config: Embedding configuration dictionary with structure: 15 | { 16 | "provider": "sentence-transformers" | "openai", 17 | "sentenceTransformers": {"model": "..."}, 18 | "openai": {"apiKey": "...", "model": "..."} 19 | } 20 | 21 | Returns: 22 | EmbeddingProvider: Configured embedding provider instance 23 | 24 | Raises: 25 | ValueError: If provider is unknown or configuration is invalid 26 | RuntimeError: If provider initialization fails 27 | 28 | Example: 29 | >>> config = { 30 | ... "provider": "openai", 31 | ... "openai": { 32 | ... "apiKey": "sk-...", 33 | ... "model": "text-embedding-3-small" 34 | ... } 35 | ... } 36 | >>> provider = create_embedding_provider(config) 37 | 38 | """ 39 | provider_type = config.get("provider", "sentence-transformers") 40 | 41 | if provider_type == "sentence-transformers": 42 | st_config = config.get("sentenceTransformers", {}) 43 | model_name = st_config.get("model", "all-MiniLM-L6-v2") 44 | device = st_config.get("device", "cpu") 45 | 46 | return SentenceTransformersProvider( 47 | model_name=model_name, 48 | device=device 49 | ) 50 | 51 | elif provider_type == "openai": 52 | openai_config = config.get("openai", {}) 53 | 54 | if not openai_config: 55 | raise ValueError( 56 | "OpenAI configuration missing. Please provide 'openai' config with 'apiKey' and 'model'." 57 | ) 58 | 59 | api_key = openai_config.get("apiKey") 60 | if not api_key: 61 | raise ValueError( 62 | "OpenAI API key missing. 
Please set 'embedding.openai.apiKey' in .raggy.json " 63 | "or use environment variable: ${OPENAI_API_KEY}" 64 | ) 65 | 66 | model = openai_config.get("model", "text-embedding-3-small") 67 | 68 | return OpenAIProvider( 69 | api_key=api_key, 70 | model=model 71 | ) 72 | 73 | else: 74 | raise ValueError( 75 | f"Unknown embedding provider: {provider_type}. " 76 | f"Supported providers: sentence-transformers, openai" 77 | ) 78 | -------------------------------------------------------------------------------- /raggy/embeddings/sentence_transformers_provider.py: -------------------------------------------------------------------------------- 1 | """Sentence Transformers embedding provider. 2 | 3 | This module provides a local embedding provider using the sentence-transformers 4 | library for offline, privacy-preserving embeddings. 5 | """ 6 | 7 | from typing import List, Union 8 | 9 | import numpy as np 10 | 11 | from .provider import EmbeddingProvider 12 | 13 | 14 | class SentenceTransformersProvider(EmbeddingProvider): 15 | """Local embedding provider using sentence-transformers. 16 | 17 | This provider uses the sentence-transformers library to generate embeddings 18 | locally without requiring API calls or internet connectivity. 19 | """ 20 | 21 | def __init__(self, model_name: str = "all-MiniLM-L6-v2", device: str = "cpu"): 22 | """Initialize sentence-transformers provider. 23 | 24 | Args: 25 | model_name: Name of the sentence-transformers model 26 | device: Device to run on ("cpu" or "cuda") 27 | 28 | Raises: 29 | ImportError: If sentence-transformers not installed 30 | RuntimeError: If model loading fails 31 | 32 | """ 33 | try: 34 | from sentence_transformers import SentenceTransformer 35 | except ImportError as e: 36 | raise ImportError( 37 | "sentence-transformers not installed. 
" 38 | "Install with: pip install sentence-transformers" 39 | ) from e 40 | 41 | self.model_name = model_name 42 | self.device = device 43 | 44 | try: 45 | self._model = SentenceTransformer(model_name, device=device) 46 | self._dimension = self._model.get_sentence_embedding_dimension() 47 | except Exception as e: 48 | raise RuntimeError(f"Failed to load model {model_name}: {e}") from e 49 | 50 | def encode( 51 | self, 52 | texts: Union[str, List[str]], 53 | batch_size: int = 32, 54 | show_progress: bool = False, 55 | ) -> np.ndarray: 56 | """Encode text(s) into embeddings. 57 | 58 | Args: 59 | texts: Single text string or list of texts to encode 60 | batch_size: Batch size for processing 61 | show_progress: Whether to show progress bar 62 | 63 | Returns: 64 | np.ndarray: Embeddings array of shape (num_texts, embedding_dim) 65 | 66 | Raises: 67 | ValueError: If texts is empty or invalid 68 | RuntimeError: If encoding fails 69 | 70 | """ 71 | if not texts: 72 | raise ValueError("texts cannot be empty") 73 | 74 | # Convert single string to list 75 | if isinstance(texts, str): 76 | texts = [texts] 77 | 78 | try: 79 | return self._model.encode( 80 | texts, 81 | batch_size=batch_size, 82 | show_progress_bar=show_progress, 83 | convert_to_numpy=True, 84 | ) 85 | except Exception as e: 86 | raise RuntimeError(f"Failed to encode texts: {e}") from e 87 | 88 | def get_dimension(self) -> int: 89 | """Get the dimension of embeddings. 90 | 91 | Returns: 92 | int: Embedding dimension 93 | 94 | """ 95 | return self._dimension 96 | 97 | def get_model_name(self) -> str: 98 | """Get the model name. 
"""BM25 scoring implementation for keyword-based search."""

import math
from collections import Counter, defaultdict
from typing import Dict, List

from ..utils.patterns import WORD_PATTERN


class BM25Scorer:
    """Lightweight BM25 implementation for keyword scoring.

    BM25 is a probabilistic ranking function used for estimating the relevance
    of documents to a given search query.
    """

    def __init__(self, k1: float = 1.2, b: float = 0.75) -> None:
        """Initialize BM25 scorer with tuning parameters.

        Args:
            k1: Controls term frequency saturation (default 1.2)
            b: Controls length normalization (default 0.75)

        """
        self.k1 = k1
        self.b = b
        self.doc_lengths: List[int] = []
        self.avg_doc_length = 0.0
        self.doc_count = 0
        self.term_frequencies: List[Dict[str, int]] = []
        self.idf_scores: Dict[str, float] = {}

    def fit(self, documents: List[str]) -> None:
        """Build BM25 index from documents.

        Safe to call repeatedly: all index state (including IDF scores) is
        reset, so refitting on a new corpus cannot leak stale statistics.

        Args:
            documents: List of document texts to index

        """
        self.doc_count = len(documents)
        self.doc_lengths = []
        self.term_frequencies = []
        # BUG FIX: idf_scores was never cleared here, so calling fit() a
        # second time kept stale IDF entries for terms that only existed in
        # the previous corpus.
        self.idf_scores = {}
        doc_term_counts: Dict[str, int] = defaultdict(int)

        # Calculate term frequencies and document lengths
        for doc in documents:
            terms = self._tokenize(doc)
            self.doc_lengths.append(len(terms))

            term_freq = Counter(terms)
            self.term_frequencies.append(term_freq)

            # Count documents containing each term (document frequency)
            for term in set(terms):
                doc_term_counts[term] += 1

        self.avg_doc_length = (
            sum(self.doc_lengths) / len(self.doc_lengths)
            if self.doc_lengths else 0.0
        )

        # Calculate IDF scores
        for term, doc_freq in doc_term_counts.items():
            # Use standard BM25 IDF: log((N + 1) / df)
            # This avoids negative scores and is more stable for small datasets
            self.idf_scores[term] = math.log((self.doc_count + 1) / doc_freq)

    def score(self, query: str, doc_index: int) -> float:
        """Calculate BM25 score for query against document.

        Args:
            query: Search query text
            doc_index: Index of document to score

        Returns:
            float: BM25 relevance score (non-negative)

        """
        if doc_index < 0 or doc_index >= len(self.term_frequencies):
            return 0.0

        query_terms = self._tokenize(query)
        score = 0.0
        doc_length = self.doc_lengths[doc_index]
        term_freq = self.term_frequencies[doc_index]

        for term in query_terms:
            if term in term_freq:
                tf = term_freq[term]
                idf = self.idf_scores.get(term, 0.0)

                # Standard BM25 term weight: tf saturation controlled by k1,
                # document-length normalization controlled by b.
                numerator = tf * (self.k1 + 1)
                length_normalization = (
                    1 - self.b + self.b * (doc_length / self.avg_doc_length)
                )
                denominator = tf + self.k1 * length_normalization
                score += idf * (numerator / denominator)

        return max(0.0, score)  # Ensure non-negative scores

    def _tokenize(self, text: str) -> List[str]:
        """Simple tokenization for text processing.

        Args:
            text: Text to tokenize

        Returns:
            List[str]: List of lowercase tokens

        """
        # Convert to lowercase and extract alphanumeric sequences using pre-compiled pattern
        return WORD_PATTERN.findall(text.lower())
13 | - Files modified: `raggy/config/loader.py`, `raggy/core/database.py`, `raggy/core/document.py`, `raggy/core/rag.py`, `raggy/core/search.py`, `raggy/setup/dependencies.py`, `raggy_cli.py` 14 | - Security verified: 0 HIGH severity issues in bandit scan 15 | - Issue #1 from TODO_MEDIUM.md resolved (2-3 hours effort) 16 | 17 | - **Silent Exception Logging**: Replaced 4 bare `pass` statements with proper logging 18 | - Added context-aware logging for cache operations and session file handling 19 | - All logging respects quiet mode (`quiet=True` for debug-level issues) 20 | - Files modified: `raggy/config/cache.py`, `raggy/utils/updates.py` 21 | - No silent failures remain in codebase (verified with `rg` search) 22 | - Issue #2 from TODO_MEDIUM.md resolved (1 hour effort) 23 | 24 | ### Changed 25 | - **DEPRECATED raggy.py**: Converted monolithic 2,919-line file to thin 243-line wrapper 26 | - Reduced from 106 KB to 6.6 KB (94% reduction) 27 | - All functionality now imported from modular `raggy/` package 28 | - Added prominent deprecation warnings (will remove in v3.0.0) 29 | - Maintained 100% backward compatibility - all existing scripts continue working 30 | - Shows migration instructions pointing to `raggy_cli.py` 31 | - Eliminated massive code duplication between raggy.py and raggy/ package 32 | 33 | ### Technical Details 34 | - **Before**: 2,919 lines with CC=18-20 functions, 106 KB file size 35 | - **After**: 243 lines with CC=1 functions (simple delegates), 6.6 KB file size 36 | - **Imports preserved**: All classes, functions, and constants re-exported for compatibility 37 | - **Entry points preserved**: main(), parse_args(), _determine_model() all delegate to raggy_cli 38 | - **User impact**: Zero breaking changes, clear migration path shown 39 | 40 | ## 2025-11-12 41 | 42 | ### Added 43 | - **Specialized Sub-Agents**: Created 7 production-grade Python agents in `.claude/agents/`: 44 | - `python-testing-engineer.md` - Fix broken tests, achieve 85% coverage 
45 | - `python-refactoring-architect.md` - Decompose God Module, eliminate duplication 46 | - `python-complexity-reducer.md` - Reduce cyclomatic complexity from 20 to ≤10 47 | - `python-security-auditor.md` - Fix os.execv vulnerability, OWASP compliance 48 | - `python-rag-backend-engineer.md` - ChromaDB abstraction, hybrid search 49 | - `python-document-processor.md` - PDF/DOCX/Markdown extraction with Strategy pattern 50 | - `python-code-quality-engineer.md` - Ruff linting, mypy strict, docstrings 51 | 52 | - **Project Instructions**: Created `.claude/CLAUDE.md` with mandatory agent delegation protocol: 53 | - LEVEL 0 enforcement: MUST delegate to specialists (direct implementation forbidden) 54 | - Task-to-Agent mapping with detailed decision tree 55 | - Verification checklist before any code changes 56 | - Multi-domain task coordination guidelines 57 | - Quality gates and commit guidelines 58 | 59 | ### Fixed 60 | - **Broken Test Suite**: Fixed ImportError in `tests/test_raggy.py` preventing all 92 tests from running 61 | - Replaced non-existent `ScoringNormalizer` class import with module-level functions 62 | - Updated 20 function calls to use `normalize_cosine_distance`, `normalize_hybrid_score`, `interpret_score` 63 | - All 5 scoring normalization tests now passing (100%) 64 | - Test suite operational: 116 tests collected (up from 0) 65 | - Coverage improved: 15% (up from 12%, target: 85%) 66 | - Issue #1 from TODO_CRITICAL.md resolved 67 | 68 | ### Context 69 | - Agents generated based on comprehensive code audit findings 70 | - Total remediation effort: 34-52 hours (4-6 weeks at 10 hours/week) 71 | - Each agent includes: 72 | - Maximum enforcement (BLOCKING quality gates) 73 | - LEVEL 0/1/2 constraint hierarchy 74 | - Anti-hallucination safeguards 75 | - Few-shot examples (BEFORE/AFTER) 76 | - 5 blocking quality gates each 77 | - Context7 verification for external APIs 78 | -------------------------------------------------------------------------------- 
"""Database management for vector storage using abstract interface."""

from pathlib import Path
from typing import Any, Dict, List, Optional

from ..utils.logging import log_error
from .chromadb_adapter import ChromaDBAdapter
from .database_interface import VectorDatabase


class DatabaseManager:
    """Handles vector database operations through abstract interface."""

    def __init__(
        self,
        db_dir: Path,
        collection_name: str = "project_docs",
        quiet: bool = False,
        database: Optional[VectorDatabase] = None
    ) -> None:
        """Initialize database manager.

        Args:
            db_dir: Directory for database storage
            collection_name: Name of the collection
            quiet: If True, suppress output
            database: Optional VectorDatabase implementation (defaults to ChromaDB)

        """
        self.db_dir = db_dir
        self.collection_name = collection_name
        self.quiet = quiet
        # Fall back to the local ChromaDB adapter when no backend is injected.
        self._database = (
            database if database is not None else ChromaDBAdapter(path=str(self.db_dir))
        )

    @property
    def client(self):
        """Get database instance for backward compatibility.

        Returns:
            VectorDatabase instance

        """
        return self._database

    def build_index(
        self,
        documents: List[Dict[str, Any]],
        embeddings: Any,
        force_rebuild: bool = False
    ) -> None:
        """Build or update the vector database.

        Args:
            documents: List of document chunks with text and metadata
            embeddings: Document embeddings array
            force_rebuild: If True, delete existing collection first

        """
        try:
            if force_rebuild:
                self._drop_existing_collection()

            collection = self._database.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": "Project documentation embeddings"},
            )

            # Push chunk text, metadata and ids through the abstract interface.
            collection.add(
                embeddings=embeddings.tolist(),
                documents=[chunk["text"] for chunk in documents],
                metadatas=[chunk["metadata"] for chunk in documents],
                ids=[chunk["id"] for chunk in documents],
            )

        except (ValueError, RuntimeError, OSError) as e:
            # Database errors: invalid parameters, connection issues
            log_error("Failed to build index", e, quiet=self.quiet)
            raise

    def _drop_existing_collection(self) -> None:
        """Delete the current collection, tolerating its absence."""
        try:
            self._database.delete_collection(self.collection_name)
            if not self.quiet:
                print("Deleted existing collection")
        except (ValueError, RuntimeError) as e:
            # Collection may not exist - this is expected on first run
            log_error("Could not delete collection (may not exist)", e, quiet=True)

    def get_collection(self):
        """Get the collection for search operations.

        Creates collection if it doesn't exist (for memory system).

        Returns:
            Collection instance from abstract interface

        """
        try:
            return self._database.get_collection(self.collection_name)
        except (ValueError, RuntimeError):
            # Collection doesn't exist, create it
            return self._database.get_or_create_collection(
                name=self.collection_name,
                metadata={"description": f"Collection: {self.collection_name}"}
            )

    def get_stats(self) -> Dict[str, Any]:
        """Get database statistics.

        Returns:
            Dict[str, Any]: Statistics including chunk count and sources

        """
        try:
            collection = self.get_collection()
            total = collection.count()

            # Tally how many chunks came from each source document.
            payload = collection.get()
            source_counts: Dict[str, int] = {}
            for metadata in payload["metadatas"]:
                origin = metadata["source"]
                source_counts[origin] = source_counts.get(origin, 0) + 1

            return {
                "total_chunks": total,
                "sources": source_counts,
                "db_path": str(self.db_dir),
            }
        except (ValueError, RuntimeError, OSError) as e:
            # Database not initialized or connection error
            log_error("Database stats unavailable", e, quiet=True)
            return {
                "error": "Database not found. Run 'python raggy.py build' first to index your documents."
            }
30 | 31 | Args: 32 | api_key: OpenAI API key 33 | model: Model name (text-embedding-3-small, text-embedding-3-large, etc.) 34 | 35 | Raises: 36 | ImportError: If openai package not installed 37 | ValueError: If model is not supported 38 | RuntimeError: If OpenAI initialization fails 39 | 40 | """ 41 | try: 42 | from openai import OpenAI 43 | except ImportError as e: 44 | raise ImportError( 45 | "openai package not installed. " 46 | "Install with: pip install openai" 47 | ) from e 48 | 49 | if model not in self.MODEL_DIMENSIONS: 50 | raise ValueError( 51 | f"Unsupported model: {model}. " 52 | f"Supported models: {list(self.MODEL_DIMENSIONS.keys())}" 53 | ) 54 | 55 | self.api_key = api_key 56 | self.model = model 57 | self._dimension = self.MODEL_DIMENSIONS[model] 58 | 59 | try: 60 | self._client = OpenAI(api_key=api_key) 61 | except Exception as e: 62 | raise RuntimeError(f"Failed to initialize OpenAI client: {e}") from e 63 | 64 | def encode( 65 | self, 66 | texts: Union[str, List[str]], 67 | batch_size: int = 100, # OpenAI allows up to 2048 texts per request 68 | show_progress: bool = False, 69 | ) -> np.ndarray: 70 | """Encode text(s) into embeddings using OpenAI API. 
71 | 72 | Args: 73 | texts: Single text string or list of texts to encode 74 | batch_size: Batch size for API requests (max 2048 for OpenAI) 75 | show_progress: Whether to show progress (not implemented for OpenAI) 76 | 77 | Returns: 78 | np.ndarray: Embeddings array of shape (num_texts, embedding_dim) 79 | 80 | Raises: 81 | ValueError: If texts is empty or invalid 82 | RuntimeError: If API call fails 83 | 84 | """ 85 | if not texts: 86 | raise ValueError("texts cannot be empty") 87 | 88 | # Convert single string to list 89 | if isinstance(texts, str): 90 | texts = [texts] 91 | 92 | try: 93 | all_embeddings = [] 94 | 95 | # Process in batches 96 | for i in range(0, len(texts), batch_size): 97 | batch = texts[i : i + batch_size] 98 | 99 | # Call OpenAI API 100 | response = self._client.embeddings.create( 101 | model=self.model, 102 | input=batch, 103 | ) 104 | 105 | # Extract embeddings from response 106 | batch_embeddings = [item.embedding for item in response.data] 107 | all_embeddings.extend(batch_embeddings) 108 | 109 | # Convert to numpy array 110 | return np.array(all_embeddings, dtype=np.float32) 111 | 112 | except Exception as e: 113 | # Check for common errors 114 | error_msg = str(e).lower() 115 | if "api key" in error_msg or "auth" in error_msg: 116 | raise RuntimeError( 117 | f"OpenAI authentication failed. Please check your API key: {e}" 118 | ) from e 119 | if "rate limit" in error_msg: 120 | raise RuntimeError( 121 | f"OpenAI rate limit exceeded. Please try again later: {e}" 122 | ) from e 123 | if "quota" in error_msg: 124 | raise RuntimeError( 125 | f"OpenAI quota exceeded. Please check your usage: {e}" 126 | ) from e 127 | raise RuntimeError(f"OpenAI API call failed: {e}") from e 128 | 129 | def get_dimension(self) -> int: 130 | """Get the dimension of embeddings. 131 | 132 | Returns: 133 | int: Embedding dimension 134 | 135 | """ 136 | return self._dimension 137 | 138 | def get_model_name(self) -> str: 139 | """Get the model name. 
140 | 141 | Returns: 142 | str: Model name 143 | 144 | """ 145 | return self.model 146 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "raggy" 3 | version = "2.0.0" 4 | description = "Universal ChromaDB RAG Setup Script - Drop-in RAG solution for any project" 5 | readme = "README.md" 6 | license = {text = "MIT"} 7 | authors = [ 8 | {name = "Raggy Contributors"} 9 | ] 10 | keywords = ["rag", "chromadb", "search", "embeddings", "nlp", "machine-learning"] 11 | classifiers = [ 12 | "Development Status :: 4 - Beta", 13 | "Intended Audience :: Developers", 14 | "License :: OSI Approved :: MIT License", 15 | "Operating System :: OS Independent", 16 | "Programming Language :: Python :: 3", 17 | "Programming Language :: Python :: 3.8", 18 | "Programming Language :: Python :: 3.9", 19 | "Programming Language :: Python :: 3.10", 20 | "Programming Language :: Python :: 3.11", 21 | "Programming Language :: Python :: 3.12", 22 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 23 | "Topic :: Software Development :: Libraries :: Python Modules", 24 | "Topic :: Text Processing :: General", 25 | ] 26 | requires-python = ">=3.8" 27 | dependencies = [ 28 | "chromadb>=0.4.0", 29 | "sentence-transformers>=2.2.0", 30 | "PyPDF2>=3.0.0", 31 | "python-docx>=1.0.0", 32 | ] 33 | 34 | [project.optional-dependencies] 35 | yaml = ["PyYAML>=6.0"] 36 | magic-win = ["python-magic-bin>=0.4.14"] 37 | magic-unix = ["python-magic"] 38 | 39 | # Cloud vector stores 40 | pinecone = ["pinecone-client>=2.0.0"] 41 | supabase = ["supabase>=2.0.0"] 42 | cloud-stores = ["pinecone-client>=2.0.0", "supabase>=2.0.0"] 43 | 44 | # Cloud embedding providers 45 | openai = ["openai>=1.0.0"] 46 | cloud-embeddings = ["openai>=1.0.0"] 47 | 48 | # All cloud features 49 | cloud = ["pinecone-client>=2.0.0", "supabase>=2.0.0", "openai>=1.0.0"] 50 | 51 | # 
All optional features 52 | all = [ 53 | "PyYAML>=6.0", 54 | "python-magic-bin>=0.4.14;platform_system=='Windows'", 55 | "python-magic;platform_system!='Windows'", 56 | "pinecone-client>=2.0.0", 57 | "supabase>=2.0.0", 58 | "openai>=1.0.0", 59 | ] 60 | 61 | dev = [ 62 | "pytest>=7.0.0", 63 | "pytest-cov>=4.0.0", 64 | "pytest-mock>=3.10.0", 65 | "pytest-xdist>=3.0.0", 66 | "ruff>=0.1.0", 67 | "mypy>=1.5.0", 68 | "types-PyYAML", 69 | "bandit>=1.7.0", 70 | "safety>=2.3.0", 71 | "pytest-benchmark>=4.0.0", 72 | ] 73 | 74 | [project.urls] 75 | Homepage = "https://github.com/example/raggy" 76 | Repository = "https://github.com/example/raggy" 77 | Issues = "https://github.com/example/raggy/issues" 78 | Documentation = "https://github.com/example/raggy#readme" 79 | 80 | [project.scripts] 81 | raggy = "raggy_cli:main" 82 | 83 | [build-system] 84 | requires = ["hatchling"] 85 | build-backend = "hatchling.build" 86 | 87 | [tool.pytest.ini_options] 88 | testpaths = ["tests"] 89 | python_files = ["test_*.py"] 90 | python_classes = ["Test*"] 91 | python_functions = ["test_*"] 92 | addopts = [ 93 | "--verbose", 94 | "--tb=short", 95 | "--strict-markers", 96 | "--disable-warnings", 97 | "--cov=raggy", 98 | "--cov-report=term-missing", 99 | "--cov-report=html:htmlcov", 100 | "--cov-fail-under=85", 101 | ] 102 | markers = [ 103 | "slow: marks tests as slow (deselect with '-m \"not slow\"')", 104 | "integration: marks tests as integration tests", 105 | "unit: marks tests as unit tests", 106 | "security: marks tests as security-focused tests", 107 | ] 108 | filterwarnings = [ 109 | "ignore::DeprecationWarning", 110 | "ignore::PendingDeprecationWarning", 111 | ] 112 | 113 | [tool.coverage.run] 114 | source = ["raggy.py"] 115 | omit = [ 116 | "tests/*", 117 | ".*/*", 118 | ] 119 | 120 | [tool.coverage.report] 121 | exclude_lines = [ 122 | "pragma: no cover", 123 | "def __repr__", 124 | "if self.debug:", 125 | "if settings.DEBUG", 126 | "raise AssertionError", 127 | "raise 
NotImplementedError", 128 | "if 0:", 129 | "if __name__ == .__main__.:", 130 | "class .*\\bProtocol\\):", 131 | "@(abc\\.)?abstractmethod", 132 | ] 133 | 134 | [tool.ruff] 135 | target-version = "py38" 136 | line-length = 88 137 | select = [ 138 | "E", # pycodestyle errors 139 | "W", # pycodestyle warnings 140 | "F", # pyflakes 141 | "I", # isort 142 | "B", # flake8-bugbear 143 | "C4", # flake8-comprehensions 144 | "UP", # pyupgrade 145 | "PIE", # flake8-pie 146 | "SIM", # flake8-simplify 147 | "RET", # flake8-return 148 | "TCH", # flake8-type-checking 149 | ] 150 | ignore = [ 151 | "E501", # line too long, handled by formatter 152 | "B008", # do not perform function calls in argument defaults 153 | "B904", # use raise from within except clause 154 | "RET505", # unnecessary else after return 155 | "RET508", # unnecessary else after break 156 | "SIM108", # use ternary operator instead of if-else 157 | "UP007", # use X | Y for type annotations (Python 3.8 compatibility) 158 | ] 159 | 160 | [tool.ruff.format] 161 | quote-style = "double" 162 | indent-style = "space" 163 | skip-magic-trailing-comma = false 164 | line-ending = "auto" 165 | 166 | [tool.ruff.isort] 167 | known-first-party = ["raggy"] 168 | 169 | [tool.mypy] 170 | python_version = "3.8" 171 | warn_return_any = true 172 | warn_unused_configs = true 173 | disallow_untyped_defs = false # Gradual typing 174 | disallow_incomplete_defs = false 175 | check_untyped_defs = true 176 | disallow_untyped_decorators = false 177 | no_implicit_optional = true 178 | warn_redundant_casts = true 179 | warn_unused_ignores = true 180 | warn_no_return = true 181 | warn_unreachable = true 182 | strict_equality = true 183 | ignore_missing_imports = true 184 | 185 | [tool.bandit] 186 | exclude_dirs = ["tests"] 187 | skips = ["B101"] # Skip test for use of assert 188 | 189 | [tool.bandit.assert_used] 190 | skips = ["*_test.py", "test_*.py"] -------------------------------------------------------------------------------- 
/raggy/utils/updates.py: -------------------------------------------------------------------------------- 1 | """Update checking utilities for version management.""" 2 | 3 | import json 4 | import time 5 | from pathlib import Path 6 | from typing import Any, Dict, Optional 7 | 8 | from .logging import log_warning 9 | 10 | # Version information 11 | __version__ = "2.0.0" 12 | 13 | # Constants 14 | SESSION_CACHE_HOURS = 24 # Hours before update check 15 | UPDATE_TIMEOUT_SECONDS = 2 # API timeout for update checks 16 | 17 | 18 | class UpdateChecker: 19 | """Handles version update checks with session caching.""" 20 | 21 | def __init__(self, config: Optional[Dict[str, Any]] = None): 22 | """Initialize update checker. 23 | 24 | Args: 25 | config: Optional configuration dictionary with update settings 26 | 27 | """ 28 | self.config = config or {} 29 | self.updates_config = self.config.get("updates", {}) 30 | self.session_file = Path.home() / ".raggy_session" 31 | self.github_repo = self.updates_config.get("github_repo", "dimitritholen/raggy") 32 | 33 | def check(self, quiet: bool = False) -> None: 34 | """Check GitHub for latest version once per session. 35 | 36 | Args: 37 | quiet: If True, suppress output 38 | 39 | """ 40 | if not self._should_check(quiet): 41 | return 42 | 43 | latest_version = self._fetch_latest_version() 44 | if latest_version and self._is_newer(latest_version): 45 | self._display_update_notice(latest_version) 46 | 47 | self._update_session_cache() 48 | 49 | def _should_check(self, quiet: bool) -> bool: 50 | """Determine if update check should run. 51 | 52 | Args: 53 | quiet: If True, check should not run 54 | 55 | Returns: 56 | bool: True if check should proceed 57 | 58 | """ 59 | if quiet: 60 | return False 61 | 62 | if not self.updates_config.get("check_enabled", True): 63 | return False 64 | 65 | return not self._is_recently_checked() 66 | 67 | def _is_recently_checked(self) -> bool: 68 | """Check if update was checked in last 24 hours. 
69 | 70 | Returns: 71 | bool: True if recently checked 72 | 73 | """ 74 | if not self.session_file.exists(): 75 | return False 76 | 77 | try: 78 | cache_age = time.time() - self.session_file.stat().st_mtime 79 | return cache_age < SESSION_CACHE_HOURS * 3600 80 | except (OSError, AttributeError) as e: 81 | log_warning( 82 | f"Could not read session file {self.session_file.name}, treating as expired", 83 | e, 84 | quiet=True 85 | ) 86 | return False 87 | 88 | def _fetch_latest_version(self) -> Optional[str]: 89 | """Fetch latest version from GitHub API. 90 | 91 | Returns: 92 | Optional[str]: Latest version string or None if fetch fails 93 | 94 | """ 95 | try: 96 | import urllib.error 97 | import urllib.request 98 | 99 | api_url = f"https://api.github.com/repos/{self.github_repo}/releases/latest" 100 | 101 | with urllib.request.urlopen(api_url, timeout=UPDATE_TIMEOUT_SECONDS) as response: 102 | if response.status == 200: 103 | data = json.loads(response.read().decode('utf-8')) 104 | latest_version = data.get("tag_name", "").lstrip("v") 105 | if latest_version: 106 | self._cached_release_url = data.get("html_url") 107 | return latest_version 108 | 109 | except ( 110 | urllib.error.URLError, 111 | urllib.error.HTTPError, 112 | json.JSONDecodeError, 113 | ConnectionError, 114 | TimeoutError, 115 | Exception 116 | ): 117 | # Silently fail - don't interrupt user workflow 118 | pass 119 | 120 | return None 121 | 122 | def _is_newer(self, latest_version: str) -> bool: 123 | """Check if latest version is newer than current. 124 | 125 | Args: 126 | latest_version: Version string to compare 127 | 128 | Returns: 129 | bool: True if latest version is different from current 130 | 131 | """ 132 | return latest_version != __version__ 133 | 134 | def _display_update_notice(self, latest_version: str) -> None: 135 | """Display update notification to user. 
136 | 137 | Args: 138 | latest_version: Version string to display 139 | 140 | """ 141 | github_url = getattr(self, '_cached_release_url', None) 142 | if not github_url: 143 | base_url = f"https://github.com/{self.github_repo}" 144 | github_url = f"{base_url}/releases/latest" 145 | 146 | print(f"📦 Raggy update available: v{latest_version} → {github_url}") 147 | 148 | def _update_session_cache(self) -> None: 149 | """Update session file to mark check as done.""" 150 | try: 151 | self.session_file.touch() 152 | except (OSError, PermissionError) as e: 153 | log_warning( 154 | f"Could not create session file {self.session_file.name}, update check will run again on next startup", 155 | e, 156 | quiet=True 157 | ) 158 | 159 | 160 | def check_for_updates( 161 | quiet: bool = False, config: Optional[Dict[str, Any]] = None 162 | ) -> None: 163 | """Check GitHub for latest version once per session (non-intrusive). 164 | 165 | Args: 166 | quiet: If True, suppress output 167 | config: Optional configuration dictionary with update settings 168 | 169 | """ 170 | checker = UpdateChecker(config) 171 | checker.check(quiet) 172 | -------------------------------------------------------------------------------- /raggy/core/vector_store_factory.py: -------------------------------------------------------------------------------- 1 | """Factory for creating vector store adapters based on configuration.""" 2 | 3 | from typing import Any, Dict 4 | 5 | from .chromadb_adapter import ChromaDBAdapter 6 | from .database_interface import VectorDatabase 7 | 8 | 9 | def create_vector_store(config: Dict[str, Any]) -> VectorDatabase: 10 | """Create a vector store adapter based on configuration. 
11 | 12 | Args: 13 | config: Vector store configuration dictionary with structure: 14 | { 15 | "provider": "chromadb" | "pinecone" | "supabase", 16 | "chromadb": {"path": "..."}, 17 | "pinecone": {"apiKey": "...", "environment": "...", "indexName": "..."}, 18 | "supabase": {"url": "...", "apiKey": "...", "tableName": "..."} 19 | } 20 | 21 | Returns: 22 | VectorDatabase: Configured vector store adapter instance 23 | 24 | Raises: 25 | ValueError: If provider is unknown or configuration is invalid 26 | RuntimeError: If adapter initialization fails 27 | 28 | Example: 29 | >>> config = { 30 | ... "provider": "chromadb", 31 | ... "chromadb": { 32 | ... "path": "./vectordb" 33 | ... } 34 | ... } 35 | >>> vector_store = create_vector_store(config) 36 | 37 | """ 38 | provider_type = config.get("provider", "chromadb") 39 | 40 | if provider_type == "chromadb": 41 | chromadb_config = config.get("chromadb", {}) 42 | path = chromadb_config.get("path", "./vectordb") 43 | 44 | return ChromaDBAdapter(path=path) 45 | 46 | elif provider_type == "pinecone": 47 | try: 48 | from .pinecone_adapter import PineconeAdapter 49 | except ImportError as e: 50 | raise ImportError( 51 | "Pinecone adapter requires pinecone. " 52 | "Install with: pip install pinecone" 53 | ) from e 54 | 55 | pinecone_config = config.get("pinecone", {}) 56 | 57 | if not pinecone_config: 58 | raise ValueError( 59 | "Pinecone configuration missing. Please provide 'pinecone' config with " 60 | "'apiKey', 'cloud', 'region', and 'indexName'." 61 | ) 62 | 63 | api_key = pinecone_config.get("apiKey") 64 | if not api_key: 65 | raise ValueError( 66 | "Pinecone API key missing. 
Please set 'vectorStore.pinecone.apiKey' in .raggy.json " 67 | "or use environment variable: ${PINECONE_API_KEY}" 68 | ) 69 | 70 | # Handle backward compatibility: parse old "environment" format 71 | # Old format: "us-east-1-aws" -> region: "us-east-1", cloud: "aws" 72 | # New format: separate "cloud" and "region" fields 73 | environment = pinecone_config.get("environment") 74 | if environment: 75 | # Old format detected - parse it 76 | # Cloud providers are: aws, gcp, azure (not numeric) 77 | parts = environment.rsplit('-', 1) 78 | if len(parts) == 2 and parts[1] in ('aws', 'gcp', 'azure'): 79 | # Valid old format: "us-east-1-aws" 80 | region = parts[0] 81 | cloud = parts[1] 82 | else: 83 | # No cloud suffix or invalid suffix - treat whole string as region 84 | region = environment 85 | cloud = "aws" 86 | else: 87 | # New format - use explicit cloud and region 88 | cloud = pinecone_config.get("cloud", "aws") 89 | region = pinecone_config.get("region") 90 | if not region: 91 | raise ValueError( 92 | "Pinecone region missing. Please set 'vectorStore.pinecone.region' " 93 | "(e.g., 'us-east-1') or use legacy 'environment' format (e.g., 'us-east-1-aws')" 94 | ) 95 | 96 | index_name = pinecone_config.get("indexName", "raggy-index") 97 | dimension = pinecone_config.get("dimension", 384) 98 | 99 | return PineconeAdapter( 100 | api_key=api_key, 101 | index_name=index_name, 102 | dimension=dimension, 103 | cloud=cloud, 104 | region=region, 105 | ) 106 | 107 | elif provider_type == "supabase": 108 | try: 109 | from .supabase_adapter import SupabaseAdapter 110 | except ImportError as e: 111 | raise ImportError( 112 | "Supabase adapter requires supabase package. " 113 | "Install with: pip install supabase" 114 | ) from e 115 | 116 | supabase_config = config.get("supabase", {}) 117 | 118 | if not supabase_config: 119 | raise ValueError( 120 | "Supabase configuration missing. Please provide 'supabase' config with " 121 | "'url' and 'apiKey'." 
122 | ) 123 | 124 | url = supabase_config.get("url") 125 | if not url: 126 | raise ValueError( 127 | "Supabase URL missing. Please set 'vectorStore.supabase.url' in .raggy.json " 128 | "or use environment variable: ${SUPABASE_URL}" 129 | ) 130 | 131 | api_key = supabase_config.get("apiKey") 132 | if not api_key: 133 | raise ValueError( 134 | "Supabase API key missing. Please set 'vectorStore.supabase.apiKey' in .raggy.json " 135 | "or use environment variable: ${SUPABASE_ANON_KEY}" 136 | ) 137 | 138 | dimension = supabase_config.get("dimension", 384) 139 | 140 | return SupabaseAdapter( 141 | url=url, 142 | api_key=api_key, 143 | dimension=dimension, 144 | ) 145 | 146 | else: 147 | raise ValueError( 148 | f"Unknown vector store provider: {provider_type}. " 149 | f"Supported providers: chromadb, pinecone, supabase" 150 | ) 151 | -------------------------------------------------------------------------------- /raggy/query/processor.py: -------------------------------------------------------------------------------- 1 | """Query processing and expansion functionality.""" 2 | 3 | from typing import Any, Dict, List, Optional, Tuple 4 | 5 | from ..utils.patterns import ( 6 | AND_TERM_PATTERN, 7 | NEGATIVE_TERM_PATTERN, 8 | QUOTED_PHRASE_PATTERN, 9 | WORD_PATTERN, 10 | ) 11 | 12 | 13 | class QueryProcessor: 14 | """Enhanced query processing with expansion and operators. 15 | 16 | Handles: 17 | - Query expansion with synonyms 18 | - Exact phrase matching (quoted strings) 19 | - Boolean operators (AND, OR, NOT) 20 | - Query type detection 21 | """ 22 | 23 | def __init__( 24 | self, custom_expansions: Optional[Dict[str, List[str]]] = None 25 | ) -> None: 26 | """Initialize query processor with optional custom expansions. 
27 | 28 | Args: 29 | custom_expansions: Optional dictionary of term expansions 30 | 31 | """ 32 | # Default expansions - can be overridden via config 33 | self.expansions = custom_expansions or { 34 | # Common technical terms 35 | "api": ["api", "application programming interface"], 36 | "ml": ["ml", "machine learning"], 37 | "ai": ["ai", "artificial intelligence"], 38 | "ui": ["ui", "user interface"], 39 | "ux": ["ux", "user experience"], 40 | # Can be extended via configuration file 41 | } 42 | 43 | def process(self, query: str) -> Dict[str, Any]: 44 | """Process query and return enhanced version with metadata. 45 | 46 | Args: 47 | query: Raw query string 48 | 49 | Returns: 50 | Dict containing: 51 | - processed: Enhanced query string 52 | - original: Original query 53 | - type: Query type (exact, question, boolean, keyword) 54 | - boost_exact: Whether to boost exact matches 55 | - must_have: List of required terms 56 | - must_not: List of excluded terms 57 | - terms: List of query terms 58 | 59 | """ 60 | # Preserve original query exactly as provided 61 | original = query 62 | # Use cleaned version for processing 63 | cleaned = query.strip() 64 | 65 | # Detect query type 66 | query_type = self._detect_type(cleaned) 67 | 68 | # Handle exact phrase queries (quoted) 69 | if query_type == "exact": 70 | # Defensively check if pattern found valid quoted phrase 71 | matches = QUOTED_PHRASE_PATTERN.findall(cleaned) 72 | if matches: 73 | phrase = matches[0] 74 | return { 75 | "processed": phrase, 76 | "original": original, 77 | "type": "exact", 78 | "boost_exact": True, 79 | "terms": [phrase], 80 | } 81 | # Handle empty quotes case 82 | elif '""' in cleaned: 83 | return { 84 | "processed": "", 85 | "original": original, 86 | "type": "exact", 87 | "boost_exact": True, 88 | "terms": [], 89 | } 90 | # If no valid match found, fall back to keyword search 91 | query_type = "keyword" 92 | 93 | # Expand terms 94 | expanded = self._expand_query(cleaned) 95 | 96 | # Extract 
boolean operators 97 | must_have, must_not = self._extract_operators(expanded) 98 | 99 | return { 100 | "processed": expanded, 101 | "original": original, 102 | "type": query_type, 103 | "boost_exact": False, 104 | "must_have": must_have, 105 | "must_not": must_not, 106 | "terms": WORD_PATTERN.findall(expanded.lower()), 107 | } 108 | 109 | def _detect_type(self, query: str) -> str: 110 | """Detect query type from content. 111 | 112 | Args: 113 | query: Query string 114 | 115 | Returns: 116 | str: Query type (exact, question, boolean, or keyword) 117 | 118 | """ 119 | # Check for valid quoted phrases (including empty quotes "") 120 | # Pattern matches non-empty quotes, but we also check for paired empty quotes 121 | if QUOTED_PHRASE_PATTERN.findall(query) or '""' in query: 122 | return "exact" 123 | 124 | question_words = ["how", "what", "why", "when", "where", "who"] 125 | if any(word in query.lower() for word in question_words): 126 | return "question" 127 | 128 | boolean_operators = [" AND ", " OR ", " -"] 129 | query_upper = query.upper() 130 | if any(op in query_upper or op.strip() in query for op in boolean_operators): 131 | return "boolean" 132 | 133 | return "keyword" 134 | 135 | def _expand_query(self, query: str) -> str: 136 | """Expand query with synonyms. 137 | 138 | Args: 139 | query: Query string 140 | 141 | Returns: 142 | str: Expanded query with OR clauses for synonyms 143 | 144 | """ 145 | expanded = query.lower() 146 | for term, expansions in self.expansions.items(): 147 | if term in expanded: 148 | # Add expansions as OR terms 149 | expansion_str = " OR ".join(expansions[1:]) # Skip the original term 150 | if expansion_str: 151 | expanded = expanded.replace(term, f"({term} OR {expansion_str})") 152 | return expanded 153 | 154 | def _extract_operators(self, query: str) -> Tuple[List[str], List[str]]: 155 | """Extract boolean operators from query. 
156 | 157 | Args: 158 | query: Query string 159 | 160 | Returns: 161 | Tuple[List[str], List[str]]: (must_have_terms, must_not_terms) 162 | 163 | """ 164 | must_have = [] 165 | must_not = [] 166 | 167 | # Extract negative terms (preceded by -) 168 | negative_terms = NEGATIVE_TERM_PATTERN.findall(query) 169 | for term in negative_terms: 170 | must_not.append(term[1:]) # Remove the - 171 | 172 | # Extract AND terms 173 | and_terms = AND_TERM_PATTERN.findall(query) 174 | must_have.extend(and_terms) 175 | 176 | return must_have, must_not 177 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test and Quality Check 2 | 3 | on: 4 | push: 5 | branches: [ main, develop ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | test: 11 | name: Test Suite 12 | runs-on: ubuntu-latest 13 | strategy: 14 | matrix: 15 | python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | 20 | - name: Install uv 21 | uses: astral-sh/setup-uv@v2 22 | with: 23 | version: "latest" 24 | 25 | - name: Set up Python ${{ matrix.python-version }} 26 | run: uv python install ${{ matrix.python-version }} 27 | 28 | - name: Create virtual environment 29 | run: uv venv --python ${{ matrix.python-version }} 30 | 31 | - name: Install dependencies 32 | run: | 33 | uv pip install --requirement requirements-dev.txt 34 | uv pip install pytest pytest-cov pytest-mock bandit safety mypy ruff 35 | uv pip install chromadb>=0.4.0 36 | uv pip install sentence-transformers>=2.2.0 37 | uv pip install PyPDF2>=3.0.0 38 | uv pip install python-docx>=1.0.0 39 | 40 | - name: Run raggy self-tests 41 | run: | 42 | source .venv/bin/activate 43 | python raggy.py test 44 | 45 | - name: Run pytest 46 | run: | 47 | source .venv/bin/activate 48 | pytest tests/ --cov=raggy --cov-report=xml --cov-report=term-missing -v 49 | 50 | - 
name: Upload coverage to Codecov
        uses: codecov/codecov-action@v3
        with:
          file: ./coverage.xml
          flags: unittests
          name: codecov-umbrella

  security:
    name: Security Scan
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Install uv
        uses: astral-sh/setup-uv@v2

      - name: Set up Python 3.11
        run: uv python install 3.11

      - name: Create virtual environment
        run: uv venv --python 3.11

      - name: Install security tools
        run: |
          uv pip install bandit safety

      - name: Run Bandit security scan
        run: |
          source .venv/bin/activate
          bandit -r raggy.py -f json -o bandit-report.json || true
          bandit -r raggy.py

      - name: Run Safety check
        run: |
          source .venv/bin/activate
          uv pip freeze | safety check --json --output safety-report.json || true
          uv pip freeze | safety check

  lint:
    name: Code Quality
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Install uv
        uses: astral-sh/setup-uv@v2

      - name: Set up Python 3.11
        run: uv python install 3.11

      - name: Create virtual environment
        run: uv venv --python 3.11

      - name: Install linting tools
        run: |
          uv pip install ruff mypy types-PyYAML

      - name: Run Ruff linter
        run: |
          source .venv/bin/activate
          ruff check raggy.py --output-format=github

      - name: Run Ruff formatter check
        run: |
          source .venv/bin/activate
          ruff format --check raggy.py

      - name: Run MyPy type checker
        run: |
          source .venv/bin/activate
          mypy raggy.py --ignore-missing-imports || true

  performance:
    name: Performance Test
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Install uv
        uses: astral-sh/setup-uv@v2

      - name: Set up Python 3.11
        run: uv python install 3.11

      - name: Create virtual environment
        run: uv venv --python 3.11

      - name: Install dependencies
        # Fix: quote version specifiers. Unquoted `chromadb>=0.4.0` is parsed
        # by the shell as `uv pip install chromadb` with stdout redirected to
        # a file named `=0.4.0`, silently installing an unconstrained version.
        run: |
          uv pip install "chromadb>=0.4.0"
          uv pip install "sentence-transformers>=2.2.0"
          uv pip install "PyPDF2>=3.0.0"
          uv pip install "python-docx>=1.0.0"

      - name: Create test documents
        run: |
          mkdir -p test_docs
          echo "# Test Document 1" > test_docs/doc1.md
          echo "This is test content for performance testing." >> test_docs/doc1.md
          echo "# Test Document 2" > test_docs/doc2.md
          echo "More test content with different keywords and phrases." >> test_docs/doc2.md

      - name: Run performance benchmark
        run: |
          source .venv/bin/activate
          python raggy.py --docs-dir test_docs build
          python raggy.py --docs-dir test_docs optimize

  integration:
    name: Integration Test
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Install uv
        uses: astral-sh/setup-uv@v2

      - name: Set up Python 3.11
        run: uv python install 3.11

      - name: Create virtual environment
        run: uv venv --python 3.11

      - name: Install dependencies
        # Fix: quote version specifiers so `>=` is not shell redirection.
        run: |
          uv pip install "chromadb>=0.4.0"
          uv pip install "sentence-transformers>=2.2.0"
          uv pip install "PyPDF2>=3.0.0"
          uv pip install "python-docx>=1.0.0"

      - name: Run system diagnostics
        run: |
          source .venv/bin/activate
          python raggy.py diagnose

      - name: Run configuration validation
        run: |
          source .venv/bin/activate
          python raggy.py validate

      - name: Test full workflow
        run: |
          source .venv/bin/activate
          mkdir -p integration_test_docs
          echo "# Integration Test Document" > integration_test_docs/integration.md
          echo "This document tests the full raggy workflow from indexing to search."
>> integration_test_docs/integration.md 196 | echo "It includes various terms for search testing: machine learning, API, database." >> integration_test_docs/integration.md 197 | 198 | python raggy.py --docs-dir integration_test_docs build 199 | python raggy.py --docs-dir integration_test_docs search "machine learning" 200 | python raggy.py --docs-dir integration_test_docs search "API" --hybrid 201 | python raggy.py --docs-dir integration_test_docs status -------------------------------------------------------------------------------- /raggy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Universal ChromaDB RAG Setup Script v2.0.0 - DEPRECATED WRAPPER. 3 | 4 | ⚠️ DEPRECATION NOTICE: 5 | ================================================================================ 6 | This monolithic raggy.py file is DEPRECATED and will be removed in v3.0.0. 7 | 8 | Please use one of these alternatives: 9 | - Recommended: python raggy_cli.py [command] 10 | - As module: python -m raggy [command] 11 | - Installed: raggy [command] 12 | 13 | This file now acts as a thin wrapper for backward compatibility only. 14 | All functionality has been refactored into the modular raggy/ package. 
15 | ================================================================================ 16 | 17 | Original features preserved through the raggy package: 18 | • Hybrid Search: Combines semantic + BM25 keyword ranking for exact matches 19 | • Smart Chunking: Markdown-aware chunking preserving document structure 20 | • Normalized Scoring: 0-1 scores with human-readable labels 21 | • Query Processing: Automatic expansion of domain terms 22 | • Model Presets: --model-preset fast/balanced/multilingual/accurate 23 | • Config Support: Optional raggy_config.yaml for customization 24 | • Multilingual: Enhanced Dutch/English mixed content support 25 | • Backward Compatible: All v1.x commands work unchanged 26 | """ 27 | 28 | import sys 29 | import warnings 30 | 31 | # Show deprecation warning when this file is executed 32 | warnings.warn( 33 | "\n" + "="*80 + "\n" 34 | "⚠️ raggy.py is DEPRECATED and will be removed in v3.0.0.\n" 35 | "Please use 'python raggy_cli.py' or 'python -m raggy' instead.\n" 36 | "This wrapper exists only for backward compatibility.\n" + 37 | "="*80, 38 | DeprecationWarning, 39 | stacklevel=2 40 | ) 41 | 42 | # ============================================================================ 43 | # IMPORTS FROM REFACTORED RAGGY PACKAGE 44 | # All functionality now lives in the modular raggy/ package 45 | # ============================================================================ 46 | 47 | # Core functionality 48 | from raggy import ( 49 | UniversalRAG, 50 | SearchEngine, 51 | DatabaseManager, 52 | DocumentProcessor, 53 | BM25Scorer, 54 | QueryProcessor, 55 | CommandFactory, 56 | __version__, 57 | ) 58 | 59 | # Configuration and setup 60 | from raggy import ( 61 | load_config, 62 | setup_environment, 63 | setup_dependencies, 64 | install_if_missing, 65 | check_for_updates, 66 | ) 67 | 68 | # Scoring and normalization functions 69 | from raggy import ( 70 | normalize_cosine_distance, 71 | normalize_hybrid_score, 72 | interpret_score, 73 | ) 74 | 75 | # 
Command implementations 76 | from raggy.cli.commands import ( 77 | InitCommand, 78 | BuildCommand, 79 | SearchCommand, 80 | InteractiveCommand, 81 | StatusCommand, 82 | OptimizeCommand, 83 | TestCommand, 84 | DiagnoseCommand, 85 | ValidateCommand, 86 | ) 87 | 88 | # Utility functions 89 | from raggy.utils.logging import ( 90 | log_error, 91 | log_warning, 92 | handle_file_error, 93 | ) 94 | 95 | from raggy.utils.security import ( 96 | validate_path, 97 | sanitize_error_message, 98 | ) 99 | 100 | from raggy.utils.symbols import ( 101 | get_symbols, 102 | SYMBOLS, 103 | ) 104 | 105 | # Cache utilities 106 | from raggy.config.cache import ( 107 | get_cache_file, 108 | load_deps_cache, 109 | save_deps_cache, 110 | ) 111 | 112 | # Constants - re-export for backward compatibility 113 | from raggy.config.constants import ( 114 | CHUNK_READ_SIZE, 115 | MAX_CACHE_SIZE, 116 | CACHE_TTL, 117 | MAX_FILE_SIZE_MB, 118 | SESSION_CACHE_HOURS, 119 | UPDATE_TIMEOUT_SECONDS, 120 | DEFAULT_CHUNK_SIZE, 121 | DEFAULT_CHUNK_OVERLAP, 122 | DEFAULT_RESULTS, 123 | DEFAULT_CONTEXT_CHARS, 124 | DEFAULT_HYBRID_WEIGHT, 125 | SUPPORTED_EXTENSIONS, 126 | GLOB_PATTERNS, 127 | FAST_MODEL, 128 | DEFAULT_MODEL, 129 | MULTILINGUAL_MODEL, 130 | ACCURATE_MODEL, 131 | ) 132 | 133 | # ============================================================================ 134 | # BACKWARD COMPATIBILITY EXPORTS 135 | # Re-export everything for scripts that import from raggy 136 | # ============================================================================ 137 | 138 | __all__ = [ 139 | # Core classes 140 | "UniversalRAG", 141 | "SearchEngine", 142 | "DatabaseManager", 143 | "DocumentProcessor", 144 | "BM25Scorer", 145 | "QueryProcessor", 146 | "CommandFactory", 147 | 148 | # Commands 149 | "InitCommand", 150 | "BuildCommand", 151 | "SearchCommand", 152 | "InteractiveCommand", 153 | "StatusCommand", 154 | "OptimizeCommand", 155 | "TestCommand", 156 | "DiagnoseCommand", 157 | "ValidateCommand", 158 | 159 | # Functions 
160 | "load_config", 161 | "setup_environment", 162 | "setup_dependencies", 163 | "install_if_missing", 164 | "check_for_updates", 165 | "normalize_cosine_distance", 166 | "normalize_hybrid_score", 167 | "interpret_score", 168 | "log_error", 169 | "log_warning", 170 | "handle_file_error", 171 | "validate_path", 172 | "sanitize_error_message", 173 | "get_symbols", 174 | "get_cache_file", 175 | "load_deps_cache", 176 | "save_deps_cache", 177 | 178 | # Constants 179 | "CHUNK_READ_SIZE", 180 | "MAX_CACHE_SIZE", 181 | "CACHE_TTL", 182 | "MAX_FILE_SIZE_MB", 183 | "SESSION_CACHE_HOURS", 184 | "UPDATE_TIMEOUT_SECONDS", 185 | "DEFAULT_CHUNK_SIZE", 186 | "DEFAULT_CHUNK_OVERLAP", 187 | "DEFAULT_RESULTS", 188 | "DEFAULT_CONTEXT_CHARS", 189 | "DEFAULT_HYBRID_WEIGHT", 190 | "SUPPORTED_EXTENSIONS", 191 | "GLOB_PATTERNS", 192 | "FAST_MODEL", 193 | "DEFAULT_MODEL", 194 | "MULTILINGUAL_MODEL", 195 | "ACCURATE_MODEL", 196 | "SYMBOLS", 197 | 198 | # Version 199 | "__version__", 200 | ] 201 | 202 | # ============================================================================ 203 | # MAIN ENTRY POINT - Delegates to raggy_cli 204 | # ============================================================================ 205 | 206 | def parse_args(): 207 | """Legacy parse_args function - delegates to raggy_cli.""" 208 | # Import here to avoid circular dependency 209 | from raggy_cli import parse_args as cli_parse_args 210 | return cli_parse_args() 211 | 212 | 213 | def main(): 214 | """Legacy entry point - delegates to raggy_cli.py implementation. 215 | 216 | This function exists only for backward compatibility. 217 | New users should use raggy_cli.py directly. 
218 | """ 219 | # Show another warning when main is called 220 | print("\n" + "="*80, file=sys.stderr) 221 | print("⚠️ NOTE: You are using the deprecated raggy.py wrapper.", file=sys.stderr) 222 | print(" Please switch to: python raggy_cli.py [command]", file=sys.stderr) 223 | print(" This wrapper will be removed in version 3.0.0", file=sys.stderr) 224 | print("="*80 + "\n", file=sys.stderr) 225 | 226 | # Delegate to the new CLI implementation 227 | from raggy_cli import main as cli_main 228 | cli_main() 229 | 230 | 231 | # Legacy helper function for backward compatibility 232 | def _determine_model(args): 233 | """Legacy model determination - delegates to raggy_cli.""" 234 | from raggy_cli import _determine_model as cli_determine_model 235 | return cli_determine_model(args) 236 | 237 | 238 | # ============================================================================ 239 | # SCRIPT ENTRY POINT 240 | # ============================================================================ 241 | 242 | if __name__ == "__main__": 243 | main() -------------------------------------------------------------------------------- /raggy/core/database_interface.py: -------------------------------------------------------------------------------- 1 | """Abstract interface for vector database operations. 2 | 3 | This module defines the abstract base classes that all vector database 4 | implementations must follow, enabling dependency inversion and allowing 5 | multiple database backends. 6 | """ 7 | 8 | from abc import ABC, abstractmethod 9 | from typing import Any, Dict, List, Optional 10 | 11 | 12 | class VectorDatabase(ABC): 13 | """Abstract interface for vector database operations. 14 | 15 | All vector database implementations (ChromaDB, Pinecone, Weaviate, etc.) 16 | must implement this interface to be compatible with the RAG system. 
    """

    @abstractmethod
    def create_collection(
        self, name: str, metadata: Optional[Dict[str, Any]] = None
    ) -> "Collection":
        """Create a new collection.

        Args:
            name: Name of the collection to create
            metadata: Optional metadata dictionary for the collection

        Returns:
            Collection: Abstract collection instance

        Raises:
            ValueError: If collection already exists
            RuntimeError: If database operation fails

        """

    @abstractmethod
    def get_collection(self, name: str) -> "Collection":
        """Get an existing collection.

        Args:
            name: Name of the collection to retrieve

        Returns:
            Collection: Abstract collection instance

        Raises:
            ValueError: If collection does not exist
            RuntimeError: If database operation fails

        """

    @abstractmethod
    def get_or_create_collection(
        self, name: str, metadata: Optional[Dict[str, Any]] = None
    ) -> "Collection":
        """Get an existing collection or create if it doesn't exist.

        Args:
            name: Name of the collection
            metadata: Optional metadata dictionary for the collection

        Returns:
            Collection: Abstract collection instance

        Raises:
            RuntimeError: If database operation fails

        """

    @abstractmethod
    def delete_collection(self, name: str) -> None:
        """Delete a collection.

        Args:
            name: Name of the collection to delete

        Raises:
            ValueError: If collection does not exist
            RuntimeError: If database operation fails

        """

    @abstractmethod
    def list_collections(self) -> List[str]:
        """List all collection names.

        Returns:
            List[str]: List of collection names

        Raises:
            RuntimeError: If database operation fails

        """


class Collection(ABC):
    """Abstract interface for collection operations.

    Represents a collection/index within a vector database where
    documents and their embeddings are stored.
    """

    @abstractmethod
    def add(
        self,
        ids: List[str],
        documents: List[str],
        embeddings: List[List[float]],
        metadatas: Optional[List[Dict[str, Any]]] = None,
    ) -> None:
        """Add documents with embeddings to the collection.

        Args:
            ids: Unique identifiers for each document
            documents: Text content of documents
            embeddings: Vector embeddings for each document
            metadatas: Optional metadata for each document

        Raises:
            ValueError: If input lists have different lengths
            RuntimeError: If database operation fails

        """

    @abstractmethod
    def query(
        self,
        query_texts: Optional[List[str]] = None,
        query_embeddings: Optional[List[List[float]]] = None,
        n_results: int = 5,
        where: Optional[Dict[str, Any]] = None,
        include: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
        """Query the collection for similar documents.

        Args:
            query_texts: Query text(s) to search for
            query_embeddings: Optional pre-computed query embeddings
            n_results: Number of results to return per query
            where: Optional metadata filter
            include: Optional list of fields to include in results
                (e.g., ["documents", "metadatas", "distances"])

        Returns:
            Dict[str, Any]: Query results with structure:
                {
                    "ids": [[...]],  # List of lists of IDs
                    "documents": [[...]],  # List of lists of documents
                    "metadatas": [[...]],  # List of lists of metadata
                    "distances": [[...]],  # List of lists of distances
                }

        NOTE(review): this nested list-of-lists shape mirrors ChromaDB's
        query() return format — confirm non-Chroma adapters preserve it.

        Raises:
            ValueError: If query parameters are invalid
            RuntimeError: If database operation fails

        """

    @abstractmethod
    def get(
        self,
        ids: Optional[List[str]] = None,
        where: Optional[Dict[str, Any]] = None,
        limit: Optional[int] = None,
        offset: Optional[int] = None,
        include: Optional[List[str]] = None,
    ) -> Dict[str, Any]:
        """Get documents from the collection.

        Args:
            ids: Optional list of IDs to retrieve
            where: Optional metadata filter
            limit: Optional maximum number of results
            offset: Optional number of results to skip
            include: Optional list of fields to include

        Returns:
            Dict[str, Any]: Documents with structure similar to query()

        Raises:
            ValueError: If parameters are invalid
            RuntimeError: If database operation fails

        """

    @abstractmethod
    def count(self) -> int:
        """Get the total number of documents in the collection.

        Returns:
            int: Number of documents

        Raises:
            RuntimeError: If database operation fails

        """

    @abstractmethod
    def delete(
        self,
        ids: Optional[List[str]] = None,
        where: Optional[Dict[str, Any]] = None,
    ) -> None:
        """Delete documents from the collection.

        Args:
            ids: Optional list of IDs to delete
            where: Optional metadata filter for deletion

        Raises:
            ValueError: If neither ids nor where is provided
            RuntimeError: If database operation fails

        """

    @abstractmethod
    def update(
        self,
        ids: List[str],
        documents: Optional[List[str]] = None,
        embeddings: Optional[List[List[float]]] = None,
        metadatas: Optional[List[Dict[str, Any]]] = None,
    ) -> None:
        """Update existing documents in the collection.

        Args:
            ids: IDs of documents to update
            documents: Optional new document texts
            embeddings: Optional new embeddings
            metadatas: Optional new metadata

        Raises:
            ValueError: If IDs don't exist or parameters are invalid
            RuntimeError: If database operation fails

        """
--------------------------------------------------------------------------------
/tests/test_memory_api.py:
--------------------------------------------------------------------------------
"""Tests for Memory public API interactions and edge cases."""

from datetime import datetime, timedelta, timezone

import pytest


class TestMemoryAPIEdgeCases:
    """Tests for Memory API edge cases."""

    # NOTE(review): `memory_api` is a fixture defined in conftest.py — presumed
    # to yield a fresh Memory instance per test; confirm isolation there.

    def test_add_with_maximum_text_size(self, memory_api):
        """Test adding memory with maximum allowed text size."""
        from raggy.core.memory import MAX_MEMORY_SIZE

        # Create text that's just under the limit
        max_text = "x" * (MAX_MEMORY_SIZE - 10)

        memory_id = memory_api.add(text=max_text,
                                   memory_type="note")

        assert memory_id.startswith("mem_")
        retrieved = memory_api.get_by_id(memory_id)
        assert len(retrieved["text"]) == MAX_MEMORY_SIZE - 10

    def test_add_with_unicode_text(self, memory_api):
        """Test adding memory with unicode characters."""
        unicode_text = "Testing with émojis 🎉 and spëcial çharacters"

        memory_id = memory_api.add(text=unicode_text, memory_type="note")

        retrieved = memory_api.get_by_id(memory_id)
        assert unicode_text in retrieved["text"]

    def test_add_with_newlines_and_special_chars(self, memory_api):
        """Test adding memory with newlines and special characters."""
        special_text = """Line 1
Line 2
Tab: here
Quote: "quoted"
Apostrophe: it's"""

        memory_id = memory_api.add(text=special_text, memory_type="note")

        retrieved = memory_api.get_by_id(memory_id)
        assert retrieved["text"] == special_text

    def test_add_with_confidence_boundaries(self, memory_api):
        """Test adding with confidence at exact boundaries."""
        # Test exact 0.0
        id1 = memory_api.add(text="Min confidence", memory_type="note", confidence=0.0)
        result1 = memory_api.get_by_id(id1)
        assert result1["metadata"]["confidence"] == 0.0

        # Test exact 1.0
        id2 = memory_api.add(text="Max confidence", memory_type="note", confidence=1.0)
        result2 = memory_api.get_by_id(id2)
        assert result2["metadata"]["confidence"] == 1.0

    def test_consecutive_deletes_and_adds(self, memory_api):
        """Test alternating delete and add operations."""
        ids = []
        for i in range(3):
            mem_id = memory_api.add(text=f"Memory {i}", memory_type="note")
            ids.append(mem_id)

        # Delete first
        memory_api.delete(ids[0])
        assert memory_api.get_by_id(ids[0]) is None
        assert memory_api.get_by_id(ids[1]) is not None

        # Add new
        new_id = memory_api.add(text="New memory 1", memory_type="decision")
        assert memory_api.get_by_id(new_id) is not None

        # Delete another
        memory_api.delete(ids[1])

        # Count should be 2 (new + ids[2])
        assert memory_api.count() == 2

    def test_add_many_memories_performance(self, memory_api):
        """Test adding many memories."""
        for i in range(20):
            memory_api.add(
                text=f"Memory content {i}",
                memory_type="note",
                priority="high" if i % 3 == 0 else "medium"
            )

        count = memory_api.count()
        assert count == 20

    def test_metadata_persistence(self, memory_api):
        """Test that metadata is correctly persisted and retrieved."""
        memory_id = memory_api.add(
            text="Memory with metadata",
            memory_type="decision",
            priority="high",
            session_id="session-123",
            ai_model="test-model",
            confidence=0.85,
            custom_key="custom_value"
        )

        retrieved = memory_api.get_by_id(memory_id)
        metadata = retrieved["metadata"]

        assert metadata["memory_type"] == "decision"
        assert metadata["priority"] == "high"
        assert metadata["session_id"] == "session-123"
        assert metadata["ai_model"] == "test-model"
        assert metadata["confidence"] == 0.85
        assert metadata["custom_key"] == "custom_value"

    def test_archive_with_valid_iso_dates(self, memory_api):
        """Test archive with various ISO date formats."""
        memory_api.add(text="Memory for archiving", memory_type="note")

        # ISO format with Z
        # An aware-UTC datetime's isoformat() ends in "+00:00", never "Z",
        # so this branch always rewrites the offset into a "Z" suffix.
        cutoff_date = (datetime.now(timezone.utc) + timedelta(days=1)).isoformat()
        if not cutoff_date.endswith('Z'):
            cutoff_date = cutoff_date.split('+')[0] + 'Z'

        archived = memory_api.archive(cutoff_date)
        assert archived == 1


class TestMemoryAPIValidation:
    """Tests for Memory API input validation."""

    @pytest.mark.parametrize("invalid_query", [""])
    def test_get_context_with_empty_query_raises_error(self, memory_api,
                                                       invalid_query):
        """Test get_context_for_prompt with empty query raises error."""
        with pytest.raises(ValueError, match="query must be a non-empty string"):
            memory_api.get_context_for_prompt(invalid_query)

    @pytest.mark.skip(reason="get_context_for_prompt calls search - ChromaDB adapter issue")
    def test_get_context_with_low_max_tokens_raises_error(self, memory_api):
        """Test get_context_for_prompt with max_tokens < 100 raises error."""
        with pytest.raises(ValueError, match="max_tokens must be >= 100"):
            memory_api.get_context_for_prompt("test query", max_tokens=50)

    def test_delete_with_invalid_id_types(self, memory_api):
        """Test delete with different invalid ID types."""
        with pytest.raises(ValueError, match="memory_id must be a non-empty string"):
            memory_api.delete("")

    def test_get_by_id_with_invalid_id_types(self, memory_api):
        """Test get_by_id with different invalid ID types."""
        with pytest.raises(ValueError, match="memory_id must be a non-empty string"):
            memory_api.get_by_id("")


class TestMemoryPriorityAndType:
    """Tests for priority and memory type handling."""

    def test_all_memory_types_stored_and_retrieved(self, memory_api):
        """Test all memory types can be stored and retrieved."""
        memory_types = ["decision", "solution", "pattern", "learning", "error", "note"]
        added_ids = {}

        for mem_type in memory_types:
            mem_id = memory_api.add(
                text=f"Test {mem_type} memory",
                memory_type=mem_type
            )
            added_ids[mem_type] = mem_id

        # Verify all can be retrieved
        for mem_type, mem_id in added_ids.items():
            retrieved = memory_api.get_by_id(mem_id)
            assert retrieved is not None
            assert retrieved["metadata"]["memory_type"] == mem_type

    def test_all_priorities_stored_and_retrieved(self, memory_api):
        """Test all priority levels can be stored and retrieved."""
        priorities = ["high", "medium", "low"]
        added_ids = {}

        for priority in priorities:
            mem_id = memory_api.add(
                text=f"Test {priority} priority memory",
                memory_type="note",
                priority=priority
            )
            added_ids[priority] = mem_id

        # Verify all can be retrieved
        for priority, mem_id in added_ids.items():
            retrieved = memory_api.get_by_id(mem_id)
            assert retrieved is not None
            assert retrieved["metadata"]["priority"] == priority

    @pytest.mark.parametrize("invalid_type", ["unknown", "memo", "event", ""])
    def test_invalid_memory_types_rejected(self, memory_api, invalid_type):
        """Test that invalid memory types are rejected."""
        with pytest.raises(ValueError, match="memory_type"):
            memory_api.add(text="Test", memory_type=invalid_type)

    @pytest.mark.parametrize("invalid_priority", ["urgent", "critical", ""])
    def test_invalid_priorities_rejected(self, memory_api, invalid_priority):
        """Test that invalid priorities are rejected."""
        with pytest.raises(ValueError, match="priority"):
            memory_api.add(text="Test", memory_type="note", priority=invalid_priority)
--------------------------------------------------------------------------------
/docs/configuration.md:
--------------------------------------------------------------------------------
# Configuration Guide

Raggy can be configured through CLI arguments, configuration files, or Python API parameters.
4 | 5 | ## Configuration Files 6 | 7 | Raggy supports two configuration formats: 8 | 9 | ### .raggy.json (Recommended for v2.0+) 10 | 11 | Modern JSON-based configuration with support for cloud vector databases and embedding providers: 12 | 13 | ```json 14 | { 15 | "vectorStore": { 16 | "provider": "chromadb", 17 | "chromadb": { 18 | "path": "./vectordb" 19 | } 20 | }, 21 | "embedding": { 22 | "provider": "sentenceTransformers", 23 | "sentenceTransformers": { 24 | "model": "all-MiniLM-L6-v2" 25 | } 26 | } 27 | } 28 | ``` 29 | 30 | **Supported vector stores:** `chromadb`, `pinecone`, `supabase` 31 | **Supported embedding providers:** `sentenceTransformers`, `openai` 32 | 33 | See [Vector Databases Guide](./vector-databases.md) for detailed configuration examples. 34 | 35 | ### raggy_config.yaml (Legacy) 36 | 37 | Create `raggy_config.yaml` in your project root: 38 | 39 | ```yaml 40 | # Document and database paths 41 | docs_dir: "./docs" 42 | db_dir: "./vectordb" 43 | 44 | # Embedding model 45 | model: "all-MiniLM-L6-v2" 46 | 47 | # Text chunking 48 | chunk_size: 1000 49 | chunk_overlap: 200 50 | 51 | # Search settings 52 | top_k: 5 53 | hybrid: true 54 | expand_query: false 55 | 56 | # Memory system 57 | memory_db_dir: "./memory_db" 58 | ``` 59 | 60 | Load configuration: 61 | 62 | ```bash 63 | python raggy_cli.py build --config raggy_config.yaml 64 | ``` 65 | 66 | ## Configuration Options 67 | 68 | ### Paths 69 | 70 | | Option | Type | Default | Description | 71 | |--------|------|---------|-------------| 72 | | `docs_dir` | string | `"./docs"` | Directory containing documents | 73 | | `db_dir` | string | `"./vectordb"` | Vector database directory | 74 | | `memory_db_dir` | string | `"./memory_db"` | Memory database directory | 75 | 76 | ### Model Settings 77 | 78 | | Option | Type | Default | Description | 79 | |--------|------|---------|-------------| 80 | | `model` | string | `"all-MiniLM-L6-v2"` | Embedding model name | 81 | | `model_preset` | string | `null` | 
Preset: fast, balanced, multilingual, accurate | 82 | 83 | ### Chunking Parameters 84 | 85 | | Option | Type | Default | Description | 86 | |--------|------|---------|-------------| 87 | | `chunk_size` | integer | `1000` | Characters per chunk | 88 | | `chunk_overlap` | integer | `200` | Overlap between chunks | 89 | 90 | **Recommended values:** 91 | - **Short documents** (tweets, comments): `chunk_size=500`, `chunk_overlap=50` 92 | - **Standard documents** (articles, docs): `chunk_size=1000`, `chunk_overlap=200` 93 | - **Long documents** (books, research): `chunk_size=1500`, `chunk_overlap=300` 94 | 95 | ### Search Settings 96 | 97 | | Option | Type | Default | Description | 98 | |--------|------|---------|-------------| 99 | | `top_k` | integer | `5` | Number of results to return | 100 | | `hybrid` | boolean | `false` | Enable hybrid search | 101 | | `expand_query` | boolean | `false` | Enable query expansion | 102 | 103 | ## Model Presets 104 | 105 | ### Fast 106 | ```yaml 107 | model_preset: fast 108 | ``` 109 | - Model: `paraphrase-MiniLM-L3-v2` 110 | - Size: 17MB 111 | - Speed: Very fast 112 | - Accuracy: Good 113 | - Use case: Quick searches, prototyping 114 | 115 | ### Balanced (Default) 116 | ```yaml 117 | model_preset: balanced 118 | ``` 119 | - Model: `all-MiniLM-L6-v2` 120 | - Size: 80MB 121 | - Speed: Fast 122 | - Accuracy: Very good 123 | - Use case: General purpose 124 | 125 | ### Multilingual 126 | ```yaml 127 | model_preset: multilingual 128 | ``` 129 | - Model: `paraphrase-multilingual-MiniLM-L12-v2` 130 | - Size: 420MB 131 | - Speed: Moderate 132 | - Accuracy: Very good 133 | - Languages: 50+ 134 | - Use case: Non-English content 135 | 136 | ### Accurate 137 | ```yaml 138 | model_preset: accurate 139 | ``` 140 | - Model: `all-mpnet-base-v2` 141 | - Size: 420MB 142 | - Speed: Slower 143 | - Accuracy: Excellent 144 | - Use case: Production systems requiring highest quality 145 | 146 | ## Environment-Specific Configuration 147 | 148 | ### 
Development 149 | 150 | ```yaml 151 | # dev_config.yaml 152 | docs_dir: "./test_docs" 153 | db_dir: "./test_vectordb" 154 | model_preset: fast 155 | chunk_size: 500 156 | top_k: 3 157 | ``` 158 | 159 | ### Production 160 | 161 | ```yaml 162 | # prod_config.yaml 163 | docs_dir: "/app/documents" 164 | db_dir: "/app/vectordb" 165 | model_preset: accurate 166 | chunk_size: 1000 167 | chunk_overlap: 200 168 | top_k: 10 169 | hybrid: true 170 | ``` 171 | 172 | ## Python API Configuration 173 | 174 | ### Basic Configuration 175 | 176 | ```python 177 | from raggy import UniversalRAG 178 | 179 | rag = UniversalRAG( 180 | docs_dir="./docs", 181 | db_dir="./vectordb", 182 | model="all-MiniLM-L6-v2", 183 | chunk_size=1000, 184 | chunk_overlap=200 185 | ) 186 | ``` 187 | 188 | ### Advanced Configuration 189 | 190 | ```python 191 | from raggy import UniversalRAG 192 | from raggy.config.loader import load_config 193 | 194 | # Load from file 195 | config = load_config("raggy_config.yaml") 196 | 197 | # Override specific settings 198 | config["top_k"] = 10 199 | config["hybrid"] = True 200 | 201 | # Initialize with config 202 | rag = UniversalRAG(**config) 203 | ``` 204 | 205 | ## Memory System Configuration 206 | 207 | ### CLI Configuration 208 | 209 | ```bash 210 | # Custom memory database location 211 | python raggy_cli.py remember "content" --db-dir ./custom_memory 212 | ``` 213 | 214 | ### Python API Configuration 215 | 216 | ```python 217 | from raggy import Memory 218 | 219 | memory = Memory( 220 | db_dir="./memory_db", 221 | model="all-MiniLM-L6-v2", 222 | chunk_size=1000 223 | ) 224 | ``` 225 | 226 | ## Performance Tuning 227 | 228 | ### For Speed 229 | 230 | ```yaml 231 | model_preset: fast 232 | chunk_size: 800 233 | top_k: 5 234 | ``` 235 | 236 | ### For Accuracy 237 | 238 | ```yaml 239 | model_preset: accurate 240 | chunk_size: 1200 241 | chunk_overlap: 250 242 | top_k: 15 243 | hybrid: true 244 | expand_query: true 245 | ``` 246 | 247 | ### For Multilingual 248 | 249 
| ```yaml 250 | model_preset: multilingual 251 | chunk_size: 1000 252 | chunk_overlap: 200 253 | ``` 254 | 255 | ## Example Configurations 256 | 257 | ### Technical Documentation 258 | 259 | ```yaml 260 | docs_dir: "./api-docs" 261 | db_dir: "./vectordb" 262 | model_preset: balanced 263 | chunk_size: 1500 264 | chunk_overlap: 300 265 | hybrid: true 266 | top_k: 10 267 | ``` 268 | 269 | ### Research Papers 270 | 271 | ```yaml 272 | docs_dir: "./papers" 273 | db_dir: "./vectordb" 274 | model_preset: accurate 275 | chunk_size: 2000 276 | chunk_overlap: 400 277 | expand_query: true 278 | top_k: 15 279 | ``` 280 | 281 | ### Quick Notes Search 282 | 283 | ```yaml 284 | docs_dir: "./notes" 285 | db_dir: "./vectordb" 286 | model_preset: fast 287 | chunk_size: 500 288 | chunk_overlap: 50 289 | top_k: 5 290 | ``` 291 | 292 | ## Configuration Priority 293 | 294 | When multiple configuration sources are present: 295 | 296 | 1. **CLI arguments** (highest priority) 297 | 2. **Configuration file** (`--config` flag) 298 | 3. **Default values** (lowest priority) 299 | 300 | Example: 301 | 302 | ```bash 303 | # chunk_size will be 1500 (CLI overrides config file) 304 | python raggy_cli.py build --config config.yaml --chunk-size 1500 305 | ``` 306 | 307 | ## Cloud Vector Database Configuration 308 | 309 | ### Pinecone Configuration (.raggy.json) 310 | 311 | ```json 312 | { 313 | "vectorStore": { 314 | "provider": "pinecone", 315 | "pinecone": { 316 | "apiKey": "${PINECONE_API_KEY}", 317 | "environment": "us-east-1-aws", 318 | "indexName": "raggy-index", 319 | "dimension": 1536 320 | } 321 | }, 322 | "embedding": { 323 | "provider": "openai", 324 | "openai": { 325 | "apiKey": "${OPENAI_API_KEY}", 326 | "model": "text-embedding-3-small" 327 | } 328 | } 329 | } 330 | ``` 331 | 332 | **Environment variables:** 333 | ```bash 334 | export PINECONE_API_KEY="pcsk_..." 335 | export OPENAI_API_KEY="sk-proj-..." 
336 | ``` 337 | 338 | ### Supabase Configuration (.raggy.json) 339 | 340 | ```json 341 | { 342 | "vectorStore": { 343 | "provider": "supabase", 344 | "supabase": { 345 | "url": "${SUPABASE_URL}", 346 | "apiKey": "${SUPABASE_ANON_KEY}", 347 | "dimension": 384 348 | } 349 | }, 350 | "embedding": { 351 | "provider": "sentenceTransformers", 352 | "sentenceTransformers": { 353 | "model": "all-MiniLM-L6-v2" 354 | } 355 | } 356 | } 357 | ``` 358 | 359 | **Environment variables:** 360 | ```bash 361 | export SUPABASE_URL="https://xxxxx.supabase.co" 362 | export SUPABASE_ANON_KEY="eyJhbGc..." 363 | ``` 364 | 365 | ### Interactive Setup 366 | 367 | The easiest way to configure cloud databases: 368 | 369 | ```bash 370 | python raggy_cli.py init --interactive 371 | ``` 372 | 373 | This will guide you through: 374 | 1. Selecting a vector database (ChromaDB, Pinecone, Supabase) 375 | 2. Selecting an embedding provider (SentenceTransformers, OpenAI) 376 | 3. Entering API keys and credentials 377 | 4. Creating `.raggy.json` configuration file 378 | 379 | ## Next Steps 380 | 381 | - [Vector Databases Guide](./vector-databases.md) - Detailed cloud database setup 382 | - [Performance Tuning](./performance.md) 383 | - [Model Selection Guide](./model-selection.md) 384 | - [API Reference](./api-reference.md) 385 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ![raggy](raggy.png) 3 | 4 | # Raggy - Universal RAG System 5 | 6 | A powerful, drop-in RAG (Retrieval-Augmented Generation) solution with hybrid search and AI development memory. 
7 | 8 | ## Quick Start 9 | 10 | ### Installation 11 | 12 | ```bash 13 | pip install raggy 14 | ``` 15 | 16 | ### Basic Usage 17 | 18 | **Document Search (RAG System):** 19 | 20 | ```python 21 | from raggy import UniversalRAG 22 | 23 | # Initialize with your documents directory 24 | rag = UniversalRAG(docs_dir="./docs") 25 | 26 | # Build the vector database 27 | rag.build() 28 | 29 | # Search your documents 30 | results = rag.search("machine learning algorithms", top_k=5) 31 | 32 | for result in results: 33 | print(f"Score: {result['score']:.2f}") 34 | print(f"File: {result['file']}") 35 | print(f"Content: {result['content']}\n") 36 | ``` 37 | 38 | **Development Memory (New in v2.0):** 39 | 40 | ```python 41 | from raggy import Memory, remember, recall 42 | 43 | # Store development context 44 | mem_id = remember( 45 | "Decided to use ChromaDB for vector storage because it's lightweight", 46 | memory_type="decision", 47 | tags=["architecture", "database"] 48 | ) 49 | 50 | # Retrieve memories 51 | results = recall("database decisions", limit=5) 52 | 53 | for memory in results: 54 | print(f"[{memory['type']}] {memory['content']}") 55 | print(f"Tags: {', '.join(memory['tags'])}\n") 56 | ``` 57 | 58 | ### CLI Usage 59 | 60 | **Document Search:** 61 | 62 | ```bash 63 | # Initialize project (first time) 64 | python raggy_cli.py init 65 | 66 | # Build vector database 67 | python raggy_cli.py build 68 | 69 | # Search documents 70 | python raggy_cli.py search "your query here" 71 | 72 | # Hybrid search (semantic + keyword) 73 | python raggy_cli.py search "api documentation" --hybrid 74 | ``` 75 | 76 | **Memory Management:** 77 | 78 | ```bash 79 | # Store a memory 80 | python raggy_cli.py remember "Fixed authentication bug in login handler" 81 | 82 | # Recall memories 83 | python raggy_cli.py recall "bug fix" 84 | 85 | # Unified search (docs + memory) 86 | python raggy_cli.py search "authentication" --include-memory 87 | ``` 88 | 89 | ## Features 90 | 91 | ### Document Search 
(RAG) 92 | - **Hybrid Search**: Combines semantic understanding with keyword matching 93 | - **Smart Chunking**: Automatically splits documents for optimal retrieval 94 | - **Multi-format Support**: PDF, DOCX, Markdown, and plain text 95 | - **Normalized Scoring**: Interpretable 0-100 relevance scores 96 | - **Query Expansion**: Automatically expands queries with synonyms 97 | 98 | ### Memory System (New in v2.0) 99 | - **Context Persistence**: Store development decisions, solutions, and learnings 100 | - **Type-based Organization**: Decisions, solutions, patterns, learnings, errors, notes 101 | - **Tag-based Retrieval**: Categorize and find memories efficiently 102 | - **Priority Levels**: Mark important memories with high/medium/low priority 103 | - **Time-based Filtering**: Find recent memories or archive old ones 104 | - **Unified Search**: Search both documents and memories together 105 | 106 | ### Cloud Vector Databases (New in v2.0) 107 | - **ChromaDB**: Local-first, zero-config vector storage (default) 108 | - **Pinecone**: Serverless cloud vector database with auto-scaling 109 | - **Supabase**: PostgreSQL + pgvector for full-stack applications 110 | - **OpenAI Embeddings**: High-quality embeddings with text-embedding-3-small/large 111 | - **Interactive Setup**: Guided configuration wizard for cloud databases 112 | 113 | ### Model Presets 114 | - **Fast**: Quick responses, lower accuracy (`paraphrase-MiniLM-L3-v2`) 115 | - **Balanced**: Good balance of speed and accuracy (default) 116 | - **Multilingual**: Support for 50+ languages 117 | - **Accurate**: Best quality, slower processing 118 | 119 | ## Configuration 120 | 121 | ### Local Configuration (ChromaDB) 122 | 123 | Create `.raggy.json` for local vector storage: 124 | 125 | ```json 126 | { 127 | "vectorStore": { 128 | "provider": "chromadb", 129 | "chromadb": { 130 | "path": "./vectordb" 131 | } 132 | }, 133 | "embedding": { 134 | "provider": "sentenceTransformers", 135 | "sentenceTransformers": { 136 
| "model": "all-MiniLM-L6-v2" 137 | } 138 | } 139 | } 140 | ``` 141 | 142 | ### Cloud Configuration (Pinecone + OpenAI) 143 | 144 | Create `.raggy.json` for cloud deployment: 145 | 146 | ```json 147 | { 148 | "vectorStore": { 149 | "provider": "pinecone", 150 | "pinecone": { 151 | "apiKey": "${PINECONE_API_KEY}", 152 | "environment": "us-east-1-aws", 153 | "indexName": "raggy-index", 154 | "dimension": 1536 155 | } 156 | }, 157 | "embedding": { 158 | "provider": "openai", 159 | "openai": { 160 | "apiKey": "${OPENAI_API_KEY}", 161 | "model": "text-embedding-3-small" 162 | } 163 | } 164 | } 165 | ``` 166 | 167 | **Interactive setup wizard:** 168 | ```bash 169 | python raggy_cli.py init --interactive 170 | ``` 171 | 172 | ### Legacy YAML Configuration 173 | 174 | Create `raggy_config.yaml` for custom settings: 175 | 176 | ```yaml 177 | docs_dir: "./docs" 178 | db_dir: "./vectordb" 179 | model: "all-MiniLM-L6-v2" 180 | chunk_size: 1000 181 | chunk_overlap: 200 182 | top_k: 5 183 | ``` 184 | 185 | ## Advanced Usage 186 | 187 | ### Python API 188 | 189 | ```python 190 | from raggy import UniversalRAG, Memory 191 | 192 | # Custom configuration 193 | rag = UniversalRAG( 194 | docs_dir="./my_docs", 195 | db_dir="./my_vectordb", 196 | model="all-MiniLM-L6-v2", 197 | chunk_size=1000, 198 | top_k=10 199 | ) 200 | 201 | # Force rebuild database 202 | rag.build(force_rebuild=True) 203 | 204 | # Hybrid search with query expansion 205 | results = rag.search( 206 | "machine learning", 207 | hybrid=True, 208 | expand_query=True 209 | ) 210 | 211 | # Memory with metadata 212 | memory = Memory(db_dir="./memory_db") 213 | mem_id = memory.add( 214 | content="Refactored search engine to use dependency injection", 215 | memory_type="pattern", 216 | tags=["refactoring", "architecture", "search"], 217 | priority="high", 218 | files=["raggy/core/search.py", "raggy/core/rag.py"] 219 | ) 220 | 221 | # Search with filters 222 | results = memory.search( 223 | "refactoring patterns", 224 | 
memory_type="pattern", 225 | tags=["architecture"], 226 | since="2025-01-01", 227 | limit=10 228 | ) 229 | ``` 230 | 231 | ### CLI Advanced Examples 232 | 233 | ```bash 234 | # Query expansion + hybrid search 235 | python raggy_cli.py search "api" --hybrid --expand 236 | 237 | # JSON output for integration 238 | python raggy_cli.py search "query" --json 239 | 240 | # Multilingual model 241 | python raggy_cli.py build --model-preset multilingual 242 | 243 | # Memory with metadata 244 | python raggy_cli.py remember "Bug fix" \ 245 | --type solution \ 246 | --tags "bug,fix,auth" \ 247 | --priority high \ 248 | --files "auth.py,login.py" 249 | 250 | # Time-based recall 251 | python raggy_cli.py recall "recent changes" --last 7d 252 | 253 | # Archive old memories 254 | python raggy_cli.py forget --archive --older-than 90d 255 | ``` 256 | 257 | ## Requirements 258 | 259 | ### Core Requirements 260 | - Python 3.8+ 261 | - ChromaDB 0.4.0+ (included by default) 262 | - sentence-transformers 2.2.0+ (included by default) 263 | - PyPDF2 3.0.0+ (for PDF support) 264 | - python-docx 1.0.0+ (for DOCX support) 265 | 266 | ### Optional Cloud Database Support 267 | - **Pinecone**: `pip install "raggy[pinecone]"` or `pip install pinecone[grpc]` 268 | - **Supabase**: `pip install "raggy[supabase]"` or `pip install supabase` 269 | - **OpenAI Embeddings**: `pip install openai` (for text-embedding-3-small/large) 270 | 271 | ## Documentation 272 | 273 | Comprehensive guides and references: 274 | 275 | ### Getting Started 276 | - [Setup Guide](./docs/setup-guide.md) - Quick setup for local and cloud deployments 277 | - [Installation Guide](./docs/installation.md) - Detailed installation instructions 278 | - [Quick Start Tutorial](./docs/quickstart.md) - Step-by-step tutorial 279 | - [Configuration Guide](./docs/configuration.md) - All configuration options 280 | 281 | ### Core Features 282 | - [Document Search (RAG)](./docs/rag-system.md) - RAG system documentation 283 | - [Memory 
System](./docs/memory-system.md) - AI development memory guide 284 | - [Vector Databases](./docs/vector-databases.md) - ChromaDB, Pinecone, Supabase guide 285 | - [Hybrid Search](./docs/hybrid-search.md) - Semantic + keyword search 286 | - [Query Expansion](./docs/query-expansion.md) - Automatic query enhancement 287 | 288 | ### API Reference 289 | - [Python API Reference](./docs/api-reference.md) - Complete API documentation 290 | - [CLI Reference](./docs/cli-reference.md) - All CLI commands 291 | - [Memory API Quick Reference](./docs/MEMORY_API_QUICK_REFERENCE.md) - Memory system API 292 | 293 | ### Advanced Topics 294 | - [Model Selection](./docs/model-selection.md) - Choosing the right model 295 | - [Performance Tuning](./docs/performance.md) - Optimization guide 296 | - [Custom Embedding Models](./docs/custom-models.md) - Using custom models 297 | - [Integration Patterns](./docs/integration.md) - Integrating with your project 298 | 299 | ### Development 300 | - [Contributing Guide](./docs/contributing.md) - How to contribute 301 | - [Architecture Overview](./docs/architecture.md) - System design 302 | - [Testing Guide](./docs/testing.md) - Running and writing tests 303 | 304 | ### Migration & Troubleshooting 305 | - [Migration Guide](./docs/migration.md) - Upgrading from v1.x 306 | - [Troubleshooting](./docs/troubleshooting.md) - Common issues and solutions 307 | - [FAQ](./docs/faq.md) - Frequently asked questions 308 | 309 | ## License 310 | 311 | MIT License - see LICENSE file for details 312 | 313 | ## Contributing 314 | 315 | Contributions welcome! See [Contributing Guide](./docs/contributing.md) for details. 

## Version

Current version: 2.0.0
--------------------------------------------------------------------------------
/tests/test_bm25.py:
--------------------------------------------------------------------------------
"""Tests for BM25Scorer functionality."""

import pytest
import math
from raggy import BM25Scorer


class TestBM25Scorer:
    """Test the BM25Scorer class."""

    def test_initialization(self):
        """Test BM25Scorer initialization with default parameters."""
        scorer = BM25Scorer()
        # k1=1.2, b=0.75 are the conventional BM25 defaults.
        assert scorer.k1 == 1.2
        assert scorer.b == 0.75
        assert scorer.doc_count == 0
        assert scorer.avg_doc_length == 0
        assert len(scorer.doc_lengths) == 0
        assert len(scorer.term_frequencies) == 0
        assert len(scorer.idf_scores) == 0

    def test_initialization_custom_params(self):
        """Test BM25Scorer initialization with custom parameters."""
        scorer = BM25Scorer(k1=1.5, b=0.8)
        assert scorer.k1 == 1.5
        assert scorer.b == 0.8

    def test_tokenize(self):
        """Test the tokenization method."""
        scorer = BM25Scorer()

        # Test basic tokenization (lowercased)
        tokens = scorer._tokenize("The quick brown fox")
        assert tokens == ["the", "quick", "brown", "fox"]

        # Test with punctuation
        tokens = scorer._tokenize("Hello, world! How are you?")
        assert tokens == ["hello", "world", "how", "are", "you"]

        # Test with numbers and special characters: underscores survive
        # ("test_function"), while "-", ".", "$" and "()" split or strip.
        tokens = scorer._tokenize("API-v1.2 test_function() $variable")
        assert tokens == ["api", "v1", "2", "test_function", "variable"]

        # Test empty string
        tokens = scorer._tokenize("")
        assert tokens == []

    def test_fit_simple_documents(self, bm25_sample_documents):
        """Test fitting BM25 with simple documents."""
        scorer = BM25Scorer()
        scorer.fit(bm25_sample_documents)

        # Check basic stats
        assert scorer.doc_count == len(bm25_sample_documents)
        assert len(scorer.doc_lengths) == len(bm25_sample_documents)
        assert len(scorer.term_frequencies) == len(bm25_sample_documents)
        assert scorer.avg_doc_length > 0

        # Check that IDF scores were calculated
        assert len(scorer.idf_scores) > 0

        # Verify some expected terms are present
        assert "the" in scorer.idf_scores
        assert "quick" in scorer.idf_scores
        assert "fox" in scorer.idf_scores

    def test_fit_calculates_correct_doc_lengths(self):
        """Test that document lengths are calculated correctly."""
        documents = [
            "one two three",  # 3 words
            "four five",  # 2 words
            "six seven eight nine ten"  # 5 words
        ]

        scorer = BM25Scorer()
        scorer.fit(documents)

        assert scorer.doc_lengths == [3, 2, 5]
        assert scorer.avg_doc_length == (3 + 2 + 5) / 3

    def test_fit_calculates_idf_scores(self):
        """Test IDF score calculation."""
        documents = [
            "the quick brown fox",  # 'the' appears in doc 0
            "a quick brown dog",  # 'the' doesn't appear
            "the lazy dog sleeps"  # 'the' appears in doc 2
        ]

        scorer = BM25Scorer()
        scorer.fit(documents)

        # 'the' appears in 2 out of 3 documents
        # IDF = log((N + 1) / df) = log((3 + 1) / 2) = log(2)
        expected_idf_the = math.log((3 + 1) / 2)
        assert abs(scorer.idf_scores["the"] - expected_idf_the) < 1e-6

        # 'quick' appears in 2 out of 3 documents
        expected_idf_quick = math.log((3 + 1) / 2)
        assert abs(scorer.idf_scores["quick"] - expected_idf_quick) < 1e-6

        # 'fox' appears in 1 out of 3 documents
        expected_idf_fox = math.log((3 + 1) / 1)
        assert abs(scorer.idf_scores["fox"] - expected_idf_fox) < 1e-6

    def test_score_simple_query(self):
        """Test scoring a simple query against documents."""
        documents = [
            "the quick brown fox jumps",
            "the lazy dog sleeps",
            "a fox runs quickly"
        ]

        scorer = BM25Scorer()
        scorer.fit(documents)

        # Score query "fox" against each document
        scores = [scorer.score("fox", i) for i in range(len(documents))]

        # Document 0 and 2 contain "fox", document 1 doesn't
        assert scores[0] > 0  # Contains "fox"
        assert scores[1] == 0  # Doesn't contain "fox"
        assert scores[2] > 0  # Contains "fox"

    def test_score_multi_term_query(self):
        """Test scoring a multi-term query."""
        documents = [
            "machine learning algorithms",
            "natural language processing",
            "machine learning techniques for natural language"
        ]

        scorer = BM25Scorer()
        scorer.fit(documents)

        # Query contains both terms in documents 0 and 2
        scores = [scorer.score("machine learning", i) for i in range(len(documents))]

        assert scores[0] > 0  # Contains both terms
        assert scores[1] == 0  # Contains neither term
        assert scores[2] > 0  # Contains both terms

        # Document 2 has both terms plus additional context, might score differently
        assert all(score >= 0 for score in scores)  # All scores non-negative

    def test_score_nonexistent_query_term(self):
        """Test scoring with query terms not in any document."""
        documents = ["cat dog bird", "fish whale shark"]

        scorer = BM25Scorer()
        scorer.fit(documents)

        # Query with term not in any document
        scores = [scorer.score("elephant", i) for i in range(len(documents))]

        # All scores should be 0
        assert all(score == 0 for score in scores)

    def test_score_invalid_document_index(self):
        """Test scoring with invalid document index."""
        documents = ["test document"]

        scorer = BM25Scorer()
        scorer.fit(documents)

        # Invalid document index should return 0 rather than raise.
        assert scorer.score("test", 5) == 0  # Index out of range
        assert scorer.score("test", -1) == 0  # Negative index

    def test_score_relevance_ranking(self):
        """Test that BM25 scores rank documents by relevance correctly."""
        documents = [
            "machine learning is a subset of artificial intelligence",  # Relevant
            "cats and dogs are pets",  # Not relevant
            "machine learning algorithms use statistical methods",  # Very relevant
            "learning to cook is fun"  # Somewhat relevant (contains 'learning')
        ]

        scorer = BM25Scorer()
        scorer.fit(documents)

        query = "machine learning algorithms"
        scores = [scorer.score(query, i) for i in range(len(documents))]

        # Document 2 should score highest (has all query terms)
        # Document 0 should score second (has 'machine learning')
        # Document 1 should score lowest (no relevant terms)
        # Document 3 should score low (only has 'learning')

        assert scores[2] > scores[0]  # Most relevant doc scores highest
        assert scores[0] > scores[3]  # More relevant than partial match
        assert scores[3] > scores[1] or scores[3] == 0  # Partial match better than none
        assert scores[1] == 0  # Irrelevant document scores 0

    def test_score_frequency_affects_ranking(self):
        """Test that term frequency affects BM25 scores."""
        documents = [
            "machine learning",  # Term appears once each
            "machine machine
learning learning machine", # Terms appear multiple times 199 | "deep neural networks" # Different terms 200 | ] 201 | 202 | scorer = BM25Scorer() 203 | scorer.fit(documents) 204 | 205 | query = "machine learning" 206 | scores = [scorer.score(query, i) for i in range(len(documents))] 207 | 208 | # Document 1 has higher term frequency, should score higher than document 0 209 | assert scores[1] > scores[0] 210 | assert scores[2] == 0 # No matching terms 211 | 212 | def test_empty_documents_list(self): 213 | """Test BM25 with empty documents list.""" 214 | scorer = BM25Scorer() 215 | scorer.fit([]) 216 | 217 | assert scorer.doc_count == 0 218 | assert scorer.avg_doc_length == 0 219 | assert len(scorer.idf_scores) == 0 220 | 221 | def test_single_document(self): 222 | """Test BM25 with a single document.""" 223 | documents = ["single test document"] 224 | 225 | scorer = BM25Scorer() 226 | scorer.fit(documents) 227 | 228 | assert scorer.doc_count == 1 229 | assert len(scorer.doc_lengths) == 1 230 | 231 | # Score should work with single document 232 | score = scorer.score("test", 0) 233 | assert score > 0 234 | 235 | @pytest.mark.parametrize("k1,b", [ 236 | (1.0, 0.5), 237 | (1.5, 1.0), 238 | (2.0, 0.0), 239 | (0.5, 0.75) 240 | ]) 241 | def test_different_parameters(self, k1, b): 242 | """Test BM25 with different k1 and b parameters.""" 243 | documents = ["test document for parameter testing"] 244 | 245 | scorer = BM25Scorer(k1=k1, b=b) 246 | scorer.fit(documents) 247 | 248 | score = scorer.score("test", 0) 249 | assert score >= 0 # Score should always be non-negative 250 | assert isinstance(score, (int, float)) -------------------------------------------------------------------------------- /raggy/config/raggy_config.py: -------------------------------------------------------------------------------- 1 | """Configuration management for Raggy. 
"""Configuration management for Raggy.

This module handles loading and validating .raggy.json configuration files,
with support for environment variable substitution and multiple discovery methods.
"""

import json
import os
import re
from pathlib import Path
from typing import Any, Dict, Optional


class RaggyConfig:
    """Raggy configuration manager with support for .raggy.json files."""

    # Baseline settings used when no .raggy.json exists; a discovered file
    # is deep-merged on top of this, so partial config files are fine.
    DEFAULT_CONFIG = {
        "vectorStore": {
            "provider": "chromadb",
            "chromadb": {
                "path": "./vectordb"
            }
        },
        "embedding": {
            "provider": "sentence-transformers",
            "sentenceTransformers": {
                "model": "all-MiniLM-L6-v2"
            }
        },
        "memory": {
            "categoriesMode": "append",
            "categories": {
                "add": [],
                "remove": [],
                "replace": []
            }
        }
    }

    def __init__(self, config_path: Optional[str] = None):
        """Initialize configuration.

        Args:
            config_path: Optional explicit path to config file.
                If not provided, will attempt discovery in order:
                1. RAGGY_CONFIG_PATH environment variable
                2. .raggy.json in current working directory

        """
        self.config_path = self._discover_config(config_path)
        self.config = self._load_config()

    def _discover_config(self, explicit_path: Optional[str] = None) -> Optional[Path]:
        """Discover configuration file.

        Priority order:
        1. Explicit path argument
        2. RAGGY_CONFIG_PATH environment variable
        3. .raggy.json in current working directory

        Args:
            explicit_path: Optional explicit path to config file

        Returns:
            Path to config file if found, None otherwise

        Raises:
            FileNotFoundError: If an explicitly requested path (argument or
                RAGGY_CONFIG_PATH) does not exist. An explicit request that
                cannot be honored is an error; silent fallback would hide it.

        """
        # 1. Check explicit path argument
        if explicit_path:
            path = Path(explicit_path)
            if path.exists():
                return path
            raise FileNotFoundError(f"Config file not found: {explicit_path}")

        # 2. Check environment variable
        env_path = os.getenv("RAGGY_CONFIG_PATH")
        if env_path:
            path = Path(env_path)
            if path.exists():
                return path
            raise FileNotFoundError(
                f"Config file not found at RAGGY_CONFIG_PATH: {env_path}"
            )

        # 3. Check current working directory
        cwd_config = Path.cwd() / ".raggy.json"
        if cwd_config.exists():
            return cwd_config

        # No config found - use defaults
        return None

    def _load_config(self) -> Dict[str, Any]:
        """Load and validate configuration.

        Returns:
            Dict: Merged configuration (defaults + file config)

        Raises:
            ValueError: If the config file contains invalid JSON, or an
                ``${ENV_VAR}`` placeholder references an unset variable.
            RuntimeError: If the config file cannot be read from disk.

        """
        # Start with defaults
        config = self._deep_copy(self.DEFAULT_CONFIG)

        # If no config file, return defaults
        if not self.config_path:
            return config

        # Keep the try body minimal: only file I/O and JSON parsing can fail
        # here.  Chain the originals (``from e``) so tracebacks keep the
        # failing line/column information.
        try:
            with open(self.config_path, encoding="utf-8") as f:
                file_config = json.load(f)
        except json.JSONDecodeError as e:
            raise ValueError(
                f"Invalid JSON in config file {self.config_path}: {e}"
            ) from e
        except OSError as e:
            raise RuntimeError(
                f"Failed to load config from {self.config_path}: {e}"
            ) from e

        # Merge with defaults (file config takes precedence)
        config = self._deep_merge(config, file_config)

        # Substitute environment variables OUTSIDE the try block above:
        # previously a broad ``except Exception`` re-wrapped the actionable
        # "Environment variable not found" ValueError as a generic
        # RuntimeError, hiding the fix ("set VAR_NAME") from the user.
        return self._substitute_env_vars(config)

    def _deep_copy(self, obj: Any) -> Any:
        """Deep copy a nested dictionary.

        Args:
            obj: Object to copy

        Returns:
            Deep copy of object

        """
        if isinstance(obj, dict):
            return {k: self._deep_copy(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [self._deep_copy(item) for item in obj]
        else:
            # Scalars (str, int, bool, None) are immutable; share them.
            return obj

    def _deep_merge(self, base: Dict[str, Any], override: Dict[str, Any]) -> Dict[str, Any]:
        """Deep merge two dictionaries.

        Args:
            base: Base dictionary
            override: Override dictionary (takes precedence)

        Returns:
            Merged dictionary (neither input is mutated)

        """
        result = self._deep_copy(base)

        for key, value in override.items():
            if key in result and isinstance(result[key], dict) and isinstance(value, dict):
                # Both sides are dicts: recurse so sibling keys survive.
                result[key] = self._deep_merge(result[key], value)
            else:
                # Override wins wholesale for scalars, lists, and new keys.
                result[key] = self._deep_copy(value)

        return result

    def _substitute_env_vars(self, obj: Any) -> Any:
        """Recursively substitute ${ENV_VAR} placeholders with environment variables.

        Args:
            obj: Object to process (can be dict, list, str, or other)

        Returns:
            Object with substituted values

        Raises:
            ValueError: If a referenced environment variable is not set.

        """
        if isinstance(obj, dict):
            return {k: self._substitute_env_vars(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [self._substitute_env_vars(item) for item in obj]
        elif isinstance(obj, str):
            # Match ${VAR_NAME} pattern
            pattern = r'\$\{([^}]+)\}'

            def replace_env_var(match):
                var_name = match.group(1)
                value = os.getenv(var_name)
                if value is None:
                    raise ValueError(
                        f"Environment variable not found: {var_name}. "
                        f"Please set {var_name} or update your .raggy.json config."
                    )
                return value

            return re.sub(pattern, replace_env_var, obj)
        else:
            return obj

    def get(self, key_path: str, default: Any = None) -> Any:
        """Get configuration value by dot-separated path.

        Args:
            key_path: Dot-separated path (e.g., "vectorStore.provider")
            default: Default value if key not found

        Returns:
            Configuration value or default

        Example:
            >>> config = RaggyConfig()
            >>> config.get("vectorStore.provider")
            'chromadb'
            >>> config.get("vectorStore.pinecone.apiKey")
            None

        """
        keys = key_path.split(".")
        value = self.config

        for key in keys:
            if isinstance(value, dict) and key in value:
                value = value[key]
            else:
                return default

        return value

    def get_vector_store_config(self) -> Dict[str, Any]:
        """Get vector store configuration.

        Returns:
            Dict with provider and provider-specific config

        """
        return self.config.get("vectorStore", {})

    def get_embedding_config(self) -> Dict[str, Any]:
        """Get embedding configuration.

        Returns:
            Dict with provider and provider-specific config

        """
        return self.config.get("embedding", {})

    def get_memory_categories(self) -> Dict[str, Any]:
        """Get memory categories configuration.

        Returns:
            Dict with categoriesMode and categories

        """
        return self.config.get("memory", {})

    def get_resolved_categories(self, default_categories: set) -> set:
        """Get resolved memory categories based on configuration mode.

        Args:
            default_categories: Default category set

        Returns:
            Set of resolved categories

        Raises:
            ValueError: If mode is "replace" or "custom" but no categories
                were provided for that mode.

        Example:
            >>> config = RaggyConfig()
            >>> defaults = {"decision", "solution", "pattern", "learning", "error", "note"}
            >>> config.get_resolved_categories(defaults)
            {'decision', 'solution', 'pattern', 'learning', 'error', 'note'}

        """
        memory_config = self.get_memory_categories()
        mode = memory_config.get("categoriesMode", "append")
        categories_config = memory_config.get("categories", {})

        if mode == "replace":
            # Use only the replacement categories
            replace_list = categories_config.get("replace", [])
            if not replace_list:
                raise ValueError(
                    "categoriesMode is 'replace' but no replacement categories provided"
                )
            return set(replace_list)

        elif mode == "custom":
            # Use only custom added categories (no defaults)
            add_list = categories_config.get("add", [])
            if not add_list:
                raise ValueError(
                    "categoriesMode is 'custom' but no categories to add provided"
                )
            return set(add_list)

        else:  # mode == "append" (default)
            # Start with defaults, add custom, then remove the excluded ones.
            result = set(default_categories)
            result.update(categories_config.get("add", []))
            result.difference_update(categories_config.get("remove", []))
            return result

    def __repr__(self) -> str:
        """String representation of config."""
        config_source = str(self.config_path) if self.config_path else "defaults"
        return f"RaggyConfig(source={config_source})"
setup guide for getting started with Raggy, including cloud vector database configuration. 4 | 5 | ## Quick Start (Local ChromaDB) 6 | 7 | **1. Install Raggy:** 8 | ```bash 9 | pip install raggy 10 | ``` 11 | 12 | **2. Initialize Project:** 13 | ```bash 14 | python raggy_cli.py init 15 | ``` 16 | 17 | **3. Build Vector Database:** 18 | ```bash 19 | python raggy_cli.py build 20 | ``` 21 | 22 | **4. Search Documents:** 23 | ```bash 24 | python raggy_cli.py search "your query" 25 | ``` 26 | 27 | **5. Store Memories:** 28 | ```bash 29 | python raggy_cli.py remember "Fixed critical bug in authentication" 30 | ``` 31 | 32 | **6. Recall Memories:** 33 | ```bash 34 | python raggy_cli.py recall "bug fix" 35 | ``` 36 | 37 | Done! You now have a fully functional RAG system with development memory. 38 | 39 | ## Interactive Cloud Setup 40 | 41 | For production deployments with Pinecone or Supabase: 42 | 43 | **1. Install with cloud support:** 44 | ```bash 45 | # For Pinecone 46 | pip install "raggy[pinecone]" 47 | 48 | # For Supabase 49 | pip install "raggy[supabase]" 50 | 51 | # For OpenAI embeddings 52 | pip install openai 53 | ``` 54 | 55 | **2. Run interactive setup:** 56 | ```bash 57 | python raggy_cli.py init --interactive 58 | ``` 59 | 60 | **3. Follow the prompts:** 61 | 62 | ``` 63 | Welcome to Raggy Interactive Setup! 64 | 65 | ? Select vector database provider: 66 | > ChromaDB (Local - recommended for development) 67 | Pinecone (Cloud - serverless, auto-scaling) 68 | Supabase (Cloud - PostgreSQL + pgvector) 69 | 70 | ? Select embedding provider: 71 | SentenceTransformers (Local - free, no API key) 72 | > OpenAI (Cloud - high quality, requires API key) 73 | 74 | ? Enter OpenAI API key: sk-proj-... 75 | ? Enter Pinecone API key: pcsk_... 76 | ? Enter Pinecone region (e.g., us-east-1-aws): us-east-1-aws 77 | ? Enter Pinecone index name [raggy-index]: 78 | ? Enter embedding dimension [1536]: 79 | 80 | ✓ Configuration saved to .raggy.json 81 | ✓ Setup complete! 
82 | ``` 83 | 84 | **4. Test the configuration:** 85 | ```bash 86 | python raggy_cli.py remember "Testing cloud setup" --type note 87 | python raggy_cli.py recall "cloud setup" 88 | ``` 89 | 90 | ## Manual Configuration 91 | 92 | ### Option 1: Local Development (ChromaDB + SentenceTransformers) 93 | 94 | Create `.raggy.json`: 95 | ```json 96 | { 97 | "vectorStore": { 98 | "provider": "chromadb", 99 | "chromadb": { 100 | "path": "./vectordb" 101 | } 102 | }, 103 | "embedding": { 104 | "provider": "sentenceTransformers", 105 | "sentenceTransformers": { 106 | "model": "all-MiniLM-L6-v2" 107 | } 108 | } 109 | } 110 | ``` 111 | 112 | **Pros:** 113 | - ✅ Zero cost (fully local) 114 | - ✅ No API keys required 115 | - ✅ Offline support 116 | - ✅ Fast iteration 117 | 118 | **Cons:** 119 | - ❌ Single machine only 120 | - ❌ No cloud sync 121 | - ❌ Manual scaling 122 | 123 | ### Option 2: Cloud Production (Pinecone + OpenAI) 124 | 125 | **Step 1: Get API Keys** 126 | 127 | 1. **Pinecone**: Sign up at [pinecone.io](https://www.pinecone.io) 128 | - Create API key in dashboard 129 | - Note your environment (e.g., us-east-1-aws) 130 | 131 | 2. **OpenAI**: Sign up at [platform.openai.com](https://platform.openai.com) 132 | - Create API key in API Keys section 133 | - Add billing information 134 | 135 | **Step 2: Set Environment Variables** 136 | ```bash 137 | export PINECONE_API_KEY="pcsk_..." 138 | export OPENAI_API_KEY="sk-proj-..." 139 | ``` 140 | 141 | **Step 3: Create Pinecone Index** 142 | 143 | Via Pinecone Console: 144 | 1. Go to Indexes → Create Index 145 | 2. Name: `raggy-index` 146 | 3. Dimensions: `1536` 147 | 4. Metric: `cosine` 148 | 5. Cloud: `aws` 149 | 6. 
Region: `us-east-1` 150 | 151 | Via Python: 152 | ```python 153 | from pinecone import Pinecone, ServerlessSpec 154 | 155 | pc = Pinecone(api_key="your-api-key") 156 | pc.create_index( 157 | name="raggy-index", 158 | dimension=1536, 159 | metric="cosine", 160 | spec=ServerlessSpec(cloud="aws", region="us-east-1") 161 | ) 162 | ``` 163 | 164 | **Step 4: Create `.raggy.json`** 165 | ```json 166 | { 167 | "vectorStore": { 168 | "provider": "pinecone", 169 | "pinecone": { 170 | "apiKey": "${PINECONE_API_KEY}", 171 | "environment": "us-east-1-aws", 172 | "indexName": "raggy-index", 173 | "dimension": 1536 174 | } 175 | }, 176 | "embedding": { 177 | "provider": "openai", 178 | "openai": { 179 | "apiKey": "${OPENAI_API_KEY}", 180 | "model": "text-embedding-3-small" 181 | } 182 | } 183 | } 184 | ``` 185 | 186 | **Step 5: Test** 187 | ```bash 188 | python raggy_cli.py remember "Cloud setup complete" --priority high 189 | python raggy_cli.py recall "setup" 190 | ``` 191 | 192 | **Pros:** 193 | - ✅ Auto-scaling 194 | - ✅ High quality embeddings 195 | - ✅ Multi-user support 196 | - ✅ Global low latency 197 | 198 | **Cons:** 199 | - ❌ Requires API keys 200 | - ❌ Monthly costs (free tier available) 201 | - ❌ Internet dependency 202 | 203 | ### Option 3: PostgreSQL Users (Supabase + SentenceTransformers) 204 | 205 | **Step 1: Create Supabase Project** 206 | 207 | 1. Sign up at [supabase.com](https://supabase.com) 208 | 2. Create new project 209 | 3. 
Wait for project initialization (~2 minutes) 210 | 211 | **Step 2: Get Credentials** 212 | 213 | In Supabase Dashboard: 214 | - Project URL: Settings → API → Project URL 215 | - Anon Key: Settings → API → anon/public key 216 | 217 | **Step 3: Enable pgvector** 218 | 219 | In SQL Editor, run: 220 | ```sql 221 | CREATE EXTENSION IF NOT EXISTS vector; 222 | ``` 223 | 224 | **Step 4: Create Match Function** 225 | 226 | In SQL Editor, run: 227 | ```sql 228 | CREATE OR REPLACE FUNCTION match_documents( 229 | query_embedding vector(384), 230 | match_threshold float DEFAULT 0.0, 231 | match_count int DEFAULT 5, 232 | table_name text DEFAULT 'project_memory' 233 | ) 234 | RETURNS TABLE ( 235 | id text, 236 | document text, 237 | metadata jsonb, 238 | similarity float 239 | ) 240 | LANGUAGE plpgsql 241 | AS $$ 242 | BEGIN 243 | RETURN QUERY 244 | EXECUTE format(' 245 | SELECT id, document, metadata, 246 | 1 - (embedding <=> $1) AS similarity 247 | FROM %I 248 | WHERE 1 - (embedding <=> $1) > $2 249 | ORDER BY embedding <=> $1 250 | LIMIT $3 251 | ', table_name) 252 | USING query_embedding, match_threshold, match_count; 253 | END; 254 | $$; 255 | ``` 256 | 257 | **Step 5: Set Environment Variables** 258 | ```bash 259 | export SUPABASE_URL="https://xxxxx.supabase.co" 260 | export SUPABASE_ANON_KEY="eyJhbGc..." 
261 | ``` 262 | 263 | **Step 6: Create `.raggy.json`** 264 | ```json 265 | { 266 | "vectorStore": { 267 | "provider": "supabase", 268 | "supabase": { 269 | "url": "${SUPABASE_URL}", 270 | "apiKey": "${SUPABASE_ANON_KEY}", 271 | "dimension": 384 272 | } 273 | }, 274 | "embedding": { 275 | "provider": "sentenceTransformers", 276 | "sentenceTransformers": { 277 | "model": "all-MiniLM-L6-v2" 278 | } 279 | } 280 | } 281 | ``` 282 | 283 | **Step 7: Test** 284 | ```bash 285 | python raggy_cli.py remember "Supabase configured successfully" 286 | python raggy_cli.py recall "supabase" 287 | ``` 288 | 289 | **Pros:** 290 | - ✅ PostgreSQL-based (familiar SQL) 291 | - ✅ Row-level security 292 | - ✅ Integrated with Supabase ecosystem 293 | - ✅ Free tier (500 MB) 294 | - ✅ No OpenAI costs (local embeddings) 295 | 296 | **Cons:** 297 | - ❌ More setup steps 298 | - ❌ Requires PostgreSQL knowledge 299 | - ❌ Manual scaling 300 | 301 | ## Verifying Your Setup 302 | 303 | ### Test Vector Database 304 | ```bash 305 | # Store a test memory 306 | python raggy_cli.py remember "Setup verification test" --type note 307 | 308 | # Retrieve it 309 | python raggy_cli.py recall "verification" 310 | 311 | # Expected output: 312 | # 🔍 Memory results for: 'verification' 313 | # 1. 
[MEMORY] 2025-11-15 12:00 | note 314 | # Setup verification test 315 | ``` 316 | 317 | ### Test Embedding Provider 318 | ```python 319 | from raggy.core.embedding_factory import create_embedding_provider 320 | from raggy.config.raggy_config import RaggyConfig 321 | 322 | config = RaggyConfig() 323 | embedding_provider = create_embedding_provider(config.config) 324 | 325 | # Generate test embedding 326 | text = "Hello world" 327 | embedding = embedding_provider.embed(text) 328 | 329 | print(f"Embedding provider: {type(embedding_provider).__name__}") 330 | print(f"Embedding dimension: {len(embedding)}") 331 | print(f"Sample values: {embedding[:5]}") 332 | 333 | # Expected output (Pinecone + OpenAI): 334 | # Embedding provider: OpenAIProvider 335 | # Embedding dimension: 1536 336 | # Sample values: [0.123, -0.456, 0.789, ...] 337 | 338 | # Expected output (ChromaDB + SentenceTransformers): 339 | # Embedding provider: SentenceTransformersProvider 340 | # Embedding dimension: 384 341 | # Sample values: [0.234, -0.567, 0.890, ...] 342 | ``` 343 | 344 | ### Test Full Pipeline 345 | ```bash 346 | # 1. Build document index 347 | echo "Test document content" > test.txt 348 | python raggy_cli.py build 349 | 350 | # 2. Search documents 351 | python raggy_cli.py search "test document" 352 | 353 | # 3. Store memory 354 | python raggy_cli.py remember "Tested full pipeline successfully" 355 | 356 | # 4. 
Unified search 357 | python raggy_cli.py search "pipeline" --include-memory 358 | ``` 359 | 360 | ## Troubleshooting 361 | 362 | ### "Module not found" errors 363 | ```bash 364 | # Pinecone 365 | pip install "pinecone[grpc]" 366 | 367 | # Supabase 368 | pip install supabase 369 | 370 | # OpenAI 371 | pip install openai 372 | ``` 373 | 374 | ### "API key not found" 375 | ```bash 376 | # Verify environment variables are set 377 | echo $PINECONE_API_KEY 378 | echo $OPENAI_API_KEY 379 | echo $SUPABASE_URL 380 | 381 | # If empty, export them: 382 | export PINECONE_API_KEY="your-key" 383 | ``` 384 | 385 | ### "Index not found" (Pinecone) 386 | ```bash 387 | # Verify index exists 388 | python -c "from pinecone import Pinecone; pc = Pinecone(api_key='your-key'); print(pc.list_indexes())" 389 | 390 | # Create if missing (see Step 3 in Pinecone setup) 391 | ``` 392 | 393 | ### "Dimension mismatch" 394 | ``` 395 | Error: Vector dimension mismatch: expected 1536, got 384 396 | ``` 397 | 398 | **Fix:** Match embedding model dimension with vector database configuration: 399 | - OpenAI `text-embedding-3-small` → dimension `1536` 400 | - SentenceTransformers `all-MiniLM-L6-v2` → dimension `384` 401 | 402 | Update `.raggy.json`: 403 | ```json 404 | { 405 | "vectorStore": { 406 | "pinecone": { 407 | "dimension": 1536 // Match OpenAI 408 | } 409 | }, 410 | "embedding": { 411 | "openai": { 412 | "model": "text-embedding-3-small" // 1536 dims 413 | } 414 | } 415 | } 416 | ``` 417 | 418 | ### "Table does not exist" (Supabase) 419 | Raggy creates tables automatically on first use. Verify: 420 | 1. pgvector extension is enabled: `SELECT * FROM pg_extension WHERE extname = 'vector';` 421 | 2. Your API key has table creation permissions 422 | 3. 
"""Shared test fixtures for raggy testing."""

import pytest
import tempfile
import shutil
import hashlib
from pathlib import Path
from typing import Dict, Any, Generator
import sys
import os

# Add parent directory to path so we can import raggy
sys.path.insert(0, str(Path(__file__).parent.parent))

import raggy


@pytest.fixture
def temp_dir() -> Generator[Path, None, None]:
    """Create a temporary directory for tests."""
    temp_path = Path(tempfile.mkdtemp())
    yield temp_path
    shutil.rmtree(temp_path, ignore_errors=True)


@pytest.fixture
def sample_docs_dir(temp_dir: Path) -> Path:
    """Create a temporary docs directory with sample files."""
    docs_dir = temp_dir / "docs"
    docs_dir.mkdir()
    return docs_dir


@pytest.fixture
def sample_md_content() -> str:
    """Sample markdown content for testing."""
    return """# Test Document

This is a test markdown document for testing raggy functionality.

## Features

- Feature 1: Text extraction
- Feature 2: Chunking
- Feature 3: Search

## API Documentation

The API provides the following methods:

### Search Method

```python
def search(query: str) -> List[Dict[str, Any]]:
    pass
```

### Configuration

The system supports various configuration options:

- `chunk_size`: Size of text chunks (default: 1000)
- `chunk_overlap`: Overlap between chunks (default: 200)
- `model_name`: Embedding model to use

## Conclusion

This document contains enough content to test various chunking and search scenarios.
"""


@pytest.fixture
def sample_txt_content() -> str:
    """Sample text content for testing."""
    return """This is a plain text document for testing.

It contains multiple paragraphs with various technical terms like API, machine learning,
user interface, and configuration settings.

The document discusses various aspects of software development including:
- Code quality
- Testing strategies
- Documentation practices
- Performance optimization

This content will be useful for testing query expansion and keyword matching.
"""


@pytest.fixture
def sample_config() -> Dict[str, Any]:
    """Sample configuration for testing."""
    return {
        "search": {
            "hybrid_weight": 0.7,
            "chunk_size": 500,  # Smaller for testing
            "chunk_overlap": 100,
            "rerank": True,
            "show_scores": True,
            "context_chars": 200,
            "max_results": 5,
            "expansions": {
                "api": ["api", "application programming interface"],
                "ml": ["ml", "machine learning"],
                "test": ["test", "testing", "unit test"]
            }
        },
        "models": {
            "default": "all-MiniLM-L6-v2",
            "fast": "paraphrase-MiniLM-L3-v2"
        },
        "chunking": {
            "smart": False,  # Disable for predictable testing
            "preserve_headers": True,
            "min_chunk_size": 100,
            "max_chunk_size": 800
        }
    }


@pytest.fixture
def mock_embedding_model():
    """Mock embedding model that returns predictable embeddings."""
    class MockEmbeddingModel:
        def __init__(self, model_name: str):
            self.model_name = model_name

        def encode(self, texts, show_progress_bar=False):
            """Return deterministic unit-norm 384-dim mock embeddings."""
            import numpy as np
            embeddings = []
            for text in texts:
                # Seed from a stable digest of the text.  NOTE: the builtin
                # hash() on str is salted per process (PYTHONHASHSEED), so
                # seeding from hash(text) would NOT be reproducible across
                # pytest runs; md5 gives run-to-run determinism.
                seed = int.from_bytes(
                    hashlib.md5(text.encode("utf-8")).digest()[:4], "little"
                )
                # Use a local Generator instead of np.random.seed so the
                # mock never clobbers global NumPy RNG state other tests
                # may rely on.
                rng = np.random.default_rng(seed)
                # 384 dimensions (typical for MiniLM), normalized to unit length.
                embedding = rng.normal(0, 1, 384)
                embedding = embedding / np.linalg.norm(embedding)
                embeddings.append(embedding)
            return np.array(embeddings)

    return MockEmbeddingModel


@pytest.fixture
def sample_documents(sample_docs_dir: Path, sample_md_content: str, sample_txt_content: str) -> Path:
    """Create sample documents for testing."""
    # Create markdown file
    md_file = sample_docs_dir / "test_doc.md"
    md_file.write_text(sample_md_content, encoding="utf-8")

    # Create text file
    txt_file = sample_docs_dir / "test_notes.txt"
    txt_file.write_text(sample_txt_content, encoding="utf-8")

    # Create a README file
    readme_content = """# Project README

This is a sample README file for testing document processing.

## Installation

```bash
pip install -r requirements.txt
```

## Usage

Run the application with:

```bash
python app.py
```
"""
    readme_file = sample_docs_dir / "README.md"
    readme_file.write_text(readme_content, encoding="utf-8")

    return sample_docs_dir


@pytest.fixture
def bm25_sample_documents() -> list:
    """Sample documents for BM25 testing."""
    return [
        "The quick brown fox jumps over the lazy dog",
        "A quick brown dog outran a quick fox",
        "The dog was lazy but the fox was quick",
        "Machine learning algorithms can process natural language",
        "Natural language processing uses machine learning techniques",
        "API documentation should be clear and comprehensive",
        "The application programming interface provides REST endpoints"
    ]


@pytest.fixture
def query_processor_test_cases() -> Dict[str, Dict[str, Any]]:
    """Test cases for query processor."""
    return {
        "simple_keyword": {
            "query": "machine learning",
            "expected_type": "keyword",
            "expected_terms": ["machine", "learning"]
        },
        "quoted_phrase": {
            "query": '"exact phrase"',
            "expected_type": "exact",
            "expected_boost": True
        },
        "question": {
            "query": "How does machine learning work?",
            "expected_type": "question",
            "expected_terms": ["how", "does", "machine", "learning", "work"]
        },
        "boolean_query": {
            "query": "machine learning AND algorithms",
            "expected_type": "boolean",
            "expected_must_have": ["machine"]
        },
        "negative_query": {
            "query": "machine learning -deep",
            "expected_type": "boolean",
            "expected_must_not": ["deep"]
        },
        "expandable_term": {
            "query": "api documentation",
            "expected_expansion": True,
            "expected_contains": "application programming interface"
        }
    }


# Environment setup for testing
os.environ.setdefault("RAGGY_TEST_MODE", "true")


# =============================================================================
# MEMORY SYSTEM FIXTURES
# =============================================================================


@pytest.fixture
def temp_db_dir(tmp_path) -> str:
    """Create temporary directory for vector database."""
    db_dir = str(tmp_path / "vectordb")
    return db_dir


@pytest.fixture
def memory_manager(temp_db_dir):
    """MemoryManager instance with real ChromaDB in temporary directory."""
    from raggy.core.memory import MemoryManager
    from raggy.core.chromadb_adapter import ChromaDBAdapter
    import os

    # Explicitly use ChromaDB and SentenceTransformers for tests.
    chromadb_adapter = ChromaDBAdapter(path=temp_db_dir)

    # Point config_path at os.devnull so config loading fails gracefully and
    # tests never pick up a developer's .raggy.json.  This ensures tests use
    # local SentenceTransformers, not OpenAI.
    manager = MemoryManager(
        db_dir=temp_db_dir,
        quiet=True,
        database=chromadb_adapter,
        config_path=os.devnull
    )
    yield manager
    # Cleanup handled by temp_db_dir fixture


@pytest.fixture
def memory_api(temp_db_dir):
    """Memory public API instance with real ChromaDB."""
    from raggy.core.memory import Memory
    from raggy.core.chromadb_adapter import ChromaDBAdapter
    import os

    # Explicitly use ChromaDB and SentenceTransformers for tests.
    chromadb_adapter = ChromaDBAdapter(path=temp_db_dir)

    # os.devnull config path: see memory_manager fixture for rationale.
    memory = Memory(
        db_dir=temp_db_dir,
        quiet=True,
        database=chromadb_adapter,
        config_path=os.devnull
    )
    yield memory
    # Cleanup handled by temp_db_dir fixture


@pytest.fixture
def sample_memory() -> Dict[str, Any]:
    """Typical memory entry for tests."""
    return {
        "text": "Decided to use dependency injection pattern for database layer",
        "memory_type": "decision",
        "tags": ["architecture", "database"],
        "priority": "high"
    }
function", 334 | "memory_type": "error", 335 | "tags": ["imports", "debugging"], 336 | "priority": "medium" 337 | } 338 | ] -------------------------------------------------------------------------------- /docs/vector-databases.md: -------------------------------------------------------------------------------- 1 | # Vector Database Support 2 | 3 | Raggy supports multiple vector database backends for both document storage (RAG) and development memory. Choose the best option for your deployment needs. 4 | 5 | ## Supported Vector Databases 6 | 7 | ### ChromaDB (Default - Local) 8 | **Best for**: Development, local projects, offline use 9 | 10 | - ✅ Zero configuration required 11 | - ✅ Fully local, no API keys needed 12 | - ✅ Fast setup and iteration 13 | - ✅ Automatic persistence to disk 14 | - ❌ Single-machine only (no cloud sync) 15 | 16 | **Installation:** 17 | ```bash 18 | pip install raggy # ChromaDB included by default 19 | ``` 20 | 21 | **Configuration (.raggy.json):** 22 | ```json 23 | { 24 | "vectorStore": { 25 | "provider": "chromadb", 26 | "chromadb": { 27 | "path": "./vectordb" 28 | } 29 | }, 30 | "embedding": { 31 | "provider": "sentenceTransformers", 32 | "sentenceTransformers": { 33 | "model": "all-MiniLM-L6-v2" 34 | } 35 | } 36 | } 37 | ``` 38 | 39 | ### Pinecone (Cloud - Serverless) 40 | **Best for**: Production, multi-user, cloud deployments, auto-scaling 41 | 42 | - ✅ Serverless architecture (auto-scaling) 43 | - ✅ Low latency globally distributed 44 | - ✅ Free tier: 100K vectors 45 | - ✅ Managed backups and high availability 46 | - ❌ Requires API key and internet connection 47 | 48 | **Installation:** 49 | ```bash 50 | pip install "raggy[pinecone]" 51 | # or 52 | pip install raggy pinecone[grpc] 53 | ``` 54 | 55 | **Configuration (.raggy.json):** 56 | ```json 57 | { 58 | "vectorStore": { 59 | "provider": "pinecone", 60 | "pinecone": { 61 | "apiKey": "${PINECONE_API_KEY}", 62 | "environment": "us-east-1-aws", 63 | "indexName": "raggy-index", 64 | 
"dimension": 1536 65 | } 66 | }, 67 | "embedding": { 68 | "provider": "openai", 69 | "openai": { 70 | "apiKey": "${OPENAI_API_KEY}", 71 | "model": "text-embedding-3-small" 72 | } 73 | } 74 | } 75 | ``` 76 | 77 | **Setup Steps:** 78 | 79 | 1. **Create Pinecone Account**: Sign up at [pinecone.io](https://www.pinecone.io) 80 | 81 | 2. **Get API Key**: Dashboard → API Keys → Create Key 82 | 83 | 3. **Create Index** (via Pinecone Console or API): 84 | ```python 85 | from pinecone import Pinecone, ServerlessSpec 86 | 87 | pc = Pinecone(api_key="your-api-key") 88 | pc.create_index( 89 | name="raggy-index", 90 | dimension=1536, # Match your embedding model 91 | metric="cosine", 92 | spec=ServerlessSpec(cloud="aws", region="us-east-1") 93 | ) 94 | ``` 95 | 96 | 4. **Set Environment Variables**: 97 | ```bash 98 | export PINECONE_API_KEY="pcsk_..." 99 | export OPENAI_API_KEY="sk-proj-..." 100 | ``` 101 | 102 | 5. **Initialize Raggy**: 103 | ```bash 104 | python raggy_cli.py init --interactive 105 | ``` 106 | 107 | **Dimension Requirements:** 108 | - OpenAI `text-embedding-3-small`: 1536 dimensions 109 | - OpenAI `text-embedding-3-large`: 3072 dimensions 110 | - SentenceTransformers `all-MiniLM-L6-v2`: 384 dimensions 111 | 112 | ### Supabase (Cloud - PostgreSQL + pgvector) 113 | **Best for**: Full-stack apps, existing PostgreSQL users, SQL access 114 | 115 | - ✅ PostgreSQL-based (familiar SQL interface) 116 | - ✅ Integrated with Supabase ecosystem 117 | - ✅ Free tier: 500 MB database 118 | - ✅ Row-level security and multi-tenancy 119 | - ❌ Requires Supabase project setup 120 | 121 | **Installation:** 122 | ```bash 123 | pip install "raggy[supabase]" 124 | # or 125 | pip install raggy supabase 126 | ``` 127 | 128 | **Configuration (.raggy.json):** 129 | ```json 130 | { 131 | "vectorStore": { 132 | "provider": "supabase", 133 | "supabase": { 134 | "url": "${SUPABASE_URL}", 135 | "apiKey": "${SUPABASE_ANON_KEY}", 136 | "dimension": 384 137 | } 138 | }, 139 | "embedding": { 140 | 
"provider": "sentenceTransformers", 141 | "sentenceTransformers": { 142 | "model": "all-MiniLM-L6-v2" 143 | } 144 | } 145 | } 146 | ``` 147 | 148 | **Setup Steps:** 149 | 150 | 1. **Create Supabase Project**: Sign up at [supabase.com](https://supabase.com) 151 | 152 | 2. **Get Credentials**: 153 | - Project URL: Settings → API → Project URL 154 | - Anon Key: Settings → API → anon/public key 155 | 156 | 3. **Enable pgvector Extension** (via SQL Editor): 157 | ```sql 158 | CREATE EXTENSION IF NOT EXISTS vector; 159 | ``` 160 | 161 | 4. **Create RPC Function** (for similarity search): 162 | ```sql 163 | CREATE OR REPLACE FUNCTION match_documents( 164 | query_embedding vector(384), 165 | match_threshold float DEFAULT 0.0, 166 | match_count int DEFAULT 5, 167 | table_name text DEFAULT 'project_memory' 168 | ) 169 | RETURNS TABLE ( 170 | id text, 171 | document text, 172 | metadata jsonb, 173 | similarity float 174 | ) 175 | LANGUAGE plpgsql 176 | AS $$ 177 | BEGIN 178 | RETURN QUERY 179 | EXECUTE format(' 180 | SELECT id, document, metadata, 181 | 1 - (embedding <=> $1) AS similarity 182 | FROM %I 183 | WHERE 1 - (embedding <=> $1) > $2 184 | ORDER BY embedding <=> $1 185 | LIMIT $3 186 | ', table_name) 187 | USING query_embedding, match_threshold, match_count; 188 | END; 189 | $$; 190 | ``` 191 | 192 | 5. **Set Environment Variables**: 193 | ```bash 194 | export SUPABASE_URL="https://xxxxx.supabase.co" 195 | export SUPABASE_ANON_KEY="eyJhbGc..." 196 | ``` 197 | 198 | 6. 
**Initialize Raggy**: 199 | ```bash 200 | python raggy_cli.py init --interactive 201 | ``` 202 | 203 | ## Comparison Matrix 204 | 205 | | Feature | ChromaDB | Pinecone | Supabase | 206 | |---------|----------|----------|----------| 207 | | **Deployment** | Local only | Cloud (serverless) | Cloud (PostgreSQL) | 208 | | **Setup Complexity** | ⭐ Easy | ⭐⭐ Moderate | ⭐⭐⭐ Advanced | 209 | | **Free Tier** | Unlimited (local) | 100K vectors | 500 MB database | 210 | | **Scaling** | Manual (single machine) | Auto-scaling | Manual (upgrade plan) | 211 | | **Multi-user** | ❌ No | ✅ Yes | ✅ Yes | 212 | | **SQL Access** | ❌ No | ❌ No | ✅ Yes | 213 | | **Latency** | <1ms (local) | 10-50ms (global) | 20-100ms (global) | 214 | | **Best Use Case** | Development, prototyping | Production apps, SaaS | Full-stack apps, PostgreSQL users | 215 | 216 | ## Embedding Provider Pairing 217 | 218 | ### Recommended Combinations 219 | 220 | **Local Development:** 221 | ```json 222 | { 223 | "vectorStore": {"provider": "chromadb"}, 224 | "embedding": {"provider": "sentenceTransformers"} 225 | } 226 | ``` 227 | - Fast, no API costs 228 | - Great for prototyping 229 | 230 | **Production (Cloud):** 231 | ```json 232 | { 233 | "vectorStore": {"provider": "pinecone"}, 234 | "embedding": {"provider": "openai"} 235 | } 236 | ``` 237 | - High quality embeddings 238 | - Scalable infrastructure 239 | - Pay-per-use pricing 240 | 241 | **PostgreSQL Users:** 242 | ```json 243 | { 244 | "vectorStore": {"provider": "supabase"}, 245 | "embedding": {"provider": "sentenceTransformers"} 246 | } 247 | ``` 248 | - Leverage existing Supabase setup 249 | - No OpenAI costs (local embeddings) 250 | - SQL access for complex queries 251 | 252 | ## Migration Between Databases 253 | 254 | ### Export from ChromaDB 255 | ```python 256 | from raggy import MemoryManager 257 | 258 | # Export memories 259 | memory = MemoryManager(db_dir="./vectordb", config_path=".raggy.json") 260 | results = memory.search("", limit=10000) # Get 
all 261 | 262 | # Save to JSON 263 | import json 264 | with open("memories_export.json", "w") as f: 265 | json.dump(results, f) 266 | ``` 267 | 268 | ### Import to Pinecone/Supabase 269 | ```python 270 | # Update .raggy.json to new provider 271 | # Then reimport: 272 | 273 | import json 274 | from raggy import MemoryManager 275 | 276 | with open("memories_export.json", "r") as f: 277 | memories = json.load(f) 278 | 279 | memory = MemoryManager(config_path=".raggy.json") 280 | for mem in memories: 281 | memory.add( 282 | text=mem["text"], 283 | memory_type=mem["metadata"].get("memory_type", "note"), 284 | tags=mem["metadata"].get("tags", []), 285 | priority=mem["metadata"].get("priority", "medium") 286 | ) 287 | ``` 288 | 289 | ## Configuration via Environment Variables 290 | 291 | All API keys can use environment variable substitution: 292 | 293 | ```json 294 | { 295 | "vectorStore": { 296 | "provider": "pinecone", 297 | "pinecone": { 298 | "apiKey": "${PINECONE_API_KEY}", 299 | "indexName": "${PINECONE_INDEX_NAME:-raggy-index}" 300 | } 301 | } 302 | } 303 | ``` 304 | 305 | **Supported syntax:** 306 | - `${VAR}` - Required variable (error if missing) 307 | - `${VAR:-default}` - Optional with default value 308 | 309 | ## Troubleshooting 310 | 311 | ### Pinecone Issues 312 | 313 | **"Index not found"** 314 | ```bash 315 | # Verify index exists 316 | python -c "from pinecone import Pinecone; pc = Pinecone(api_key='your-key'); print(pc.list_indexes())" 317 | ``` 318 | 319 | **"Dimension mismatch"** 320 | - Ensure `dimension` in config matches your embedding model 321 | - OpenAI text-embedding-3-small = 1536 322 | - SentenceTransformers all-MiniLM-L6-v2 = 384 323 | 324 | **"gRPC module not found"** 325 | ```bash 326 | pip install "pinecone[grpc]" 327 | ``` 328 | 329 | ### Supabase Issues 330 | 331 | **"exec_sql RPC not found"** 332 | - Execute the `match_documents` SQL function in Supabase SQL Editor 333 | - Verify pgvector extension is enabled: `SELECT * FROM 
pg_extension WHERE extname = 'vector';` 334 | 335 | **"Table does not exist"** 336 | - Raggy creates tables automatically on first use 337 | - Verify your API key has table creation permissions 338 | 339 | ### ChromaDB Issues 340 | 341 | **"Database locked"** 342 | - Close other processes using the same `db_dir` 343 | - Delete `./vectordb/chroma.sqlite3-wal` if stuck 344 | 345 | **"Collection not found"** 346 | ```bash 347 | python raggy_cli.py build # Rebuild index 348 | ``` 349 | 350 | ## Performance Tips 351 | 352 | ### Pinecone 353 | - Use closest region to your users (us-east-1, eu-west-1, etc.) 354 | - Batch upserts (up to 100 vectors per call) 355 | - Use namespace isolation for multi-tenancy 356 | 357 | ### Supabase 358 | - Create indexes on metadata fields for filtered queries 359 | - Use connection pooling for high-traffic apps 360 | - Consider `pgbouncer` for connection management 361 | 362 | ### ChromaDB 363 | - Use SSD storage for better performance 364 | - Limit collection size (<1M vectors for optimal speed) 365 | - Regular vacuum/optimize operations 366 | 367 | ## Security Best Practices 368 | 369 | 1. **Never commit API keys** 370 | ```bash 371 | # Add to .gitignore 372 | echo ".raggy.json" >> .gitignore 373 | ``` 374 | 375 | 2. **Use environment variables** 376 | ```bash 377 | export PINECONE_API_KEY="..." 378 | export OPENAI_API_KEY="..." 379 | ``` 380 | 381 | 3. **Rotate keys regularly** 382 | - Pinecone: Dashboard → API Keys → Rotate 383 | - Supabase: Settings → API → Generate New Key 384 | 385 | 4. 
**Use read-only keys where possible** 386 | - Supabase supports service role vs anon keys 387 | - Pinecone supports read-only API keys 388 | 389 | ## Cost Estimation 390 | 391 | ### Pinecone 392 | - Free: 100K vectors (1536 dims) 393 | - Starter: $0.096/GB/month (~1M vectors = $15/month) 394 | - Enterprise: Volume discounts 395 | 396 | ### Supabase 397 | - Free: 500 MB database 398 | - Pro: $25/month (8 GB) 399 | - Scale: Usage-based pricing 400 | 401 | ### OpenAI Embeddings 402 | - text-embedding-3-small: $0.02 per 1M tokens 403 | - ~1,500 tokens = 1 document (average) 404 | - 10,000 documents ≈ $0.30 405 | 406 | ### ChromaDB (Local) 407 | - $0 (runs on your machine) 408 | - Storage: ~200 MB per 100K vectors (384 dims) 409 | 410 | ## See Also 411 | 412 | - [Configuration Guide](./configuration.md) - Full config reference 413 | - [Memory System](./memory-system.md) - Development memory features 414 | - [API Reference](./api-reference.md) - Python API documentation 415 | - [Troubleshooting](./troubleshooting.md) - Common issues and solutions 416 | -------------------------------------------------------------------------------- /raggy_cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """Universal ChromaDB RAG Setup Script v2.0.0 - Entry Point. 3 | 4 | This is a thin wrapper that imports the refactored raggy package. 5 | The actual implementation is in the raggy/ package. 
6 | """ 7 | 8 | import argparse 9 | import sys 10 | from typing import Any 11 | 12 | from raggy import ( 13 | CommandFactory, 14 | UniversalRAG, 15 | __version__, 16 | check_for_updates, 17 | load_config, 18 | setup_dependencies, 19 | ) 20 | from raggy.config.constants import DEFAULT_MODEL, FAST_MODEL 21 | from raggy.utils.logging import log_error 22 | 23 | 24 | def parse_args() -> Any: 25 | """Parse command line arguments.""" 26 | parser = argparse.ArgumentParser( 27 | description="Universal ChromaDB RAG Setup Script v2.0.0 - Enhanced with hybrid search and smart chunking", 28 | formatter_class=argparse.RawDescriptionHelpFormatter, 29 | epilog=""" 30 | Examples: 31 | Setup: 32 | %(prog)s init # Initialize project environment (first-time setup) 33 | 34 | Basic Usage: 35 | %(prog)s build # Build/update index with smart chunking 36 | %(prog)s search "your search term" # Semantic search with normalized scores 37 | %(prog)s status # Database statistics and configuration 38 | 39 | Enhanced Search: 40 | %(prog)s search "exact phrase" --hybrid # Hybrid semantic + keyword search 41 | %(prog)s search "api" --expand # Query expansion (api → application programming interface) 42 | %(prog)s search "documentation" --hybrid --expand # Combined hybrid + expansion 43 | 44 | Model Selection: 45 | %(prog)s build --model-preset multilingual # Use multilingual model for non-English content 46 | %(prog)s search "query" --model-preset fast # Quick search with smaller model 47 | 48 | Output & Analysis: 49 | %(prog)s search "query" --json # Enhanced JSON with score breakdown 50 | %(prog)s optimize # Benchmark semantic vs hybrid search 51 | %(prog)s interactive --quiet # Interactive mode, minimal output 52 | 53 | Memory Management: 54 | %(prog)s remember "Fixed bug in search" # Store development context 55 | %(prog)s recall "bug fix" # Search memories 56 | %(prog)s forget # Delete specific memory 57 | %(prog)s forget --archive --older-than 90d # Archive old memories 58 | %(prog)s forget 
--all # Delete all memories (requires strict confirmation) 59 | 60 | Advanced: 61 | %(prog)s rebuild --config custom.yaml # Use custom configuration 62 | %(prog)s search "term" --results 10 # More results with quality scores 63 | """, 64 | ) 65 | 66 | parser.add_argument( 67 | "command", 68 | choices=["init", "build", "rebuild", "search", "interactive", "status", "optimize", "test", "diagnose", "validate", "remember", "recall", "forget"], 69 | help="Command to execute", 70 | ) 71 | parser.add_argument("query", nargs="*", help="Search query (for search/recall commands), memory text (for remember command), or memory ID (for forget command)") 72 | 73 | # Options 74 | parser.add_argument( 75 | "--docs-dir", default="./docs", help="Documents directory (default: ./docs)" 76 | ) 77 | parser.add_argument( 78 | "--db-dir", 79 | default="./vectordb", 80 | help="Vector database directory (default: ./vectordb)", 81 | ) 82 | parser.add_argument( 83 | "--model", default="all-MiniLM-L6-v2", help="Embedding model name" 84 | ) 85 | parser.add_argument( 86 | "--chunk-size", type=int, default=1000, help="Text chunk size (default: 1000)" 87 | ) 88 | parser.add_argument( 89 | "--chunk-overlap", 90 | type=int, 91 | default=200, 92 | help="Text chunk overlap (default: 200)", 93 | ) 94 | parser.add_argument( 95 | "--results", type=int, default=5, help="Number of search results (default: 5)" 96 | ) 97 | 98 | # Flags 99 | parser.add_argument( 100 | "--fast", 101 | action="store_true", 102 | help="Use faster, smaller model (paraphrase-MiniLM-L3-v2)", 103 | ) 104 | parser.add_argument( 105 | "--hybrid", action="store_true", help="Use hybrid semantic+keyword search" 106 | ) 107 | parser.add_argument( 108 | "--expand", action="store_true", help="Expand query with synonyms" 109 | ) 110 | parser.add_argument( 111 | "--model-preset", 112 | choices=["fast", "balanced", "multilingual", "accurate"], 113 | help="Use model preset (overrides --model)", 114 | ) 115 | parser.add_argument( 116 | 
"--skip-deps", 117 | action="store_true", 118 | help="Skip dependency checks (faster startup)", 119 | ) 120 | parser.add_argument("--quiet", "-q", action="store_true", help="Minimal output") 121 | parser.add_argument( 122 | "--json", action="store_true", help="Output search results as JSON" 123 | ) 124 | parser.add_argument( 125 | "--config", help="Path to config file (default: raggy_config.yaml)" 126 | ) 127 | parser.add_argument("--version", action="version", version=f"raggy {__version__}") 128 | 129 | # Init command specific arguments 130 | parser.add_argument( 131 | "--interactive", 132 | action="store_true", 133 | help="Force interactive setup questionnaire (for init command)" 134 | ) 135 | parser.add_argument( 136 | "--non-interactive", 137 | action="store_true", 138 | help="Skip interactive setup questionnaire (for init command)" 139 | ) 140 | 141 | # Remember command specific arguments 142 | parser.add_argument( 143 | "--file", 144 | help="Read memory text from file (for remember command)" 145 | ) 146 | parser.add_argument( 147 | "--stdin", 148 | action="store_true", 149 | help="Read memory text from stdin (for remember command)" 150 | ) 151 | parser.add_argument( 152 | "--type", 153 | choices=["decision", "solution", "pattern", "learning", "error", "note"], 154 | default=None, 155 | help="Memory type (for remember command, default: note; for recall command: filter by type)" 156 | ) 157 | parser.add_argument( 158 | "--tags", 159 | help="Comma-separated tags (for remember command, e.g., 'api,refactor')" 160 | ) 161 | parser.add_argument( 162 | "--priority", 163 | choices=["high", "medium", "low"], 164 | default="medium", 165 | help="Priority level (for remember command, default: medium)" 166 | ) 167 | parser.add_argument( 168 | "--files", 169 | help="Comma-separated file paths involved (for remember command)" 170 | ) 171 | 172 | # Recall command specific arguments 173 | parser.add_argument( 174 | "--since", 175 | help="Filter memories after this ISO date 
(for recall command, e.g., '2025-01-01')" 176 | ) 177 | parser.add_argument( 178 | "--last", 179 | help="Filter memories from relative time ago (for recall command, e.g., '7d', '2w', '30d', '3m')" 180 | ) 181 | parser.add_argument( 182 | "--include-docs", 183 | action="store_true", 184 | help="Also search documentation (for recall command, unified search)" 185 | ) 186 | 187 | # Forget command specific arguments 188 | parser.add_argument( 189 | "--all", 190 | action="store_true", 191 | help="Delete all memories (for forget command, requires strict confirmation)" 192 | ) 193 | parser.add_argument( 194 | "--archive", 195 | action="store_true", 196 | help="Archive old memories instead of deleting (for forget command)" 197 | ) 198 | parser.add_argument( 199 | "--older-than", 200 | help="Archive memories older than this time (for forget command with --archive, e.g., '90d', '6m', '1y')" 201 | ) 202 | 203 | # Search command enhancement 204 | parser.add_argument( 205 | "--include-memory", 206 | action="store_true", 207 | help="Also search memory (for search command, unified search)" 208 | ) 209 | 210 | return parser.parse_args() 211 | 212 | 213 | def _determine_model(args: Any) -> str: 214 | """Determine which model to use based on arguments.""" 215 | if args.model_preset: 216 | config = load_config(args.config) 217 | preset_models = { 218 | "fast": config["models"]["fast"], 219 | "multilingual": config["models"]["multilingual"], 220 | "accurate": config["models"]["accurate"], 221 | } 222 | return preset_models.get(args.model_preset, config["models"]["default"]) 223 | else: 224 | return FAST_MODEL if args.fast else args.model 225 | 226 | 227 | def main() -> None: 228 | """Main entry point using Command pattern.""" 229 | args = parse_args() 230 | 231 | # Check for updates early (non-intrusive, once per session) 232 | try: 233 | config = load_config(args.config) if hasattr(args, 'config') else {} 234 | check_for_updates(quiet=args.quiet, config=config) 235 | except (OSError, 
RuntimeError, ValueError, ConnectionError) as e: 236 | # Update check failure - don't interrupt workflow, just log at debug level 237 | if not args.quiet: 238 | print(f"Debug: Update check failed: {e}") 239 | 240 | # Handle forget command memory_id extraction 241 | if args.command == "forget": 242 | # Extract memory_id from query argument if provided 243 | if args.query and len(args.query) > 0: 244 | args.memory_id = args.query[0] 245 | else: 246 | args.memory_id = None 247 | 248 | # Create and execute command 249 | try: 250 | command = CommandFactory.create_command(args.command) 251 | 252 | # Handle init, remember, and forget commands specially (no RAG instance needed) 253 | if args.command in ("init", "remember", "forget"): 254 | command.execute(args) 255 | return 256 | 257 | # Setup dependencies for other commands 258 | if not args.skip_deps: 259 | setup_dependencies(quiet=args.quiet) 260 | else: 261 | # Still need to import even if skipping dependency checks 262 | try: 263 | import chromadb 264 | import PyPDF2 265 | from sentence_transformers import SentenceTransformer 266 | 267 | try: 268 | import magic 269 | except ImportError: 270 | pass 271 | except ImportError as e: 272 | log_error(f"Missing dependency: {e}", quiet=args.quiet) 273 | log_error("Run without --skip-deps or install dependencies manually", quiet=args.quiet) 274 | return 275 | 276 | # Determine model to use 277 | model_name = _determine_model(args) 278 | 279 | # Initialize RAG system 280 | rag = UniversalRAG( 281 | docs_dir=args.docs_dir, 282 | db_dir=args.db_dir, 283 | model_name=model_name, 284 | chunk_size=args.chunk_size, 285 | chunk_overlap=args.chunk_overlap, 286 | quiet=args.quiet, 287 | config_path=args.config, 288 | ) 289 | 290 | # Execute the command 291 | command.execute(args, rag) 292 | 293 | except ValueError as e: 294 | # Invalid command arguments or parameters 295 | log_error(str(e), quiet=args.quiet) 296 | sys.exit(1) 297 | except (ImportError, ModuleNotFoundError) as e: 298 | # 
Missing dependencies 299 | log_error(f"Missing dependency executing command '{args.command}'", e, quiet=args.quiet) 300 | sys.exit(1) 301 | except (OSError, RuntimeError) as e: 302 | # File system or runtime errors 303 | log_error(f"Error executing command '{args.command}'", e, quiet=args.quiet) 304 | sys.exit(1) 305 | 306 | 307 | if __name__ == "__main__": 308 | main() -------------------------------------------------------------------------------- /tests/test_query_processor.py: -------------------------------------------------------------------------------- 1 | """Tests for QueryProcessor functionality.""" 2 | 3 | import pytest 4 | from raggy import QueryProcessor 5 | 6 | 7 | class TestQueryProcessor: 8 | """Test the QueryProcessor class.""" 9 | 10 | def test_initialization_default(self): 11 | """Test QueryProcessor initialization with default expansions.""" 12 | processor = QueryProcessor() 13 | 14 | # Check default expansions are loaded 15 | assert "api" in processor.expansions 16 | assert "ml" in processor.expansions 17 | assert "ai" in processor.expansions 18 | assert "ui" in processor.expansions 19 | assert "ux" in processor.expansions 20 | 21 | # Verify expansion contents 22 | assert processor.expansions["api"] == ["api", "application programming interface"] 23 | assert processor.expansions["ml"] == ["ml", "machine learning"] 24 | 25 | def test_initialization_custom_expansions(self): 26 | """Test QueryProcessor initialization with custom expansions.""" 27 | custom_expansions = { 28 | "db": ["db", "database"], 29 | "js": ["js", "javascript"] 30 | } 31 | 32 | processor = QueryProcessor(custom_expansions) 33 | 34 | # Should use custom expansions only 35 | assert processor.expansions == custom_expansions 36 | assert "api" not in processor.expansions # Default not loaded 37 | assert "db" in processor.expansions 38 | assert "js" in processor.expansions 39 | 40 | def test_detect_type_keyword(self): 41 | """Test detection of keyword query type.""" 42 | processor = 
QueryProcessor() 43 | 44 | assert processor._detect_type("machine learning") == "keyword" 45 | assert processor._detect_type("python programming") == "keyword" 46 | assert processor._detect_type("single") == "keyword" 47 | 48 | def test_detect_type_exact(self): 49 | """Test detection of exact phrase query type.""" 50 | processor = QueryProcessor() 51 | 52 | assert processor._detect_type('"exact phrase"') == "exact" 53 | assert processor._detect_type('"machine learning"') == "exact" 54 | assert processor._detect_type('"single word"') == "exact" 55 | 56 | def test_detect_type_question(self): 57 | """Test detection of question query type.""" 58 | processor = QueryProcessor() 59 | 60 | assert processor._detect_type("How does this work?") == "question" 61 | assert processor._detect_type("What is machine learning?") == "question" 62 | assert processor._detect_type("Why use Python?") == "question" 63 | assert processor._detect_type("When should I use this?") == "question" 64 | assert processor._detect_type("Where can I find docs?") == "question" 65 | assert processor._detect_type("Who created this?") == "question" 66 | 67 | def test_detect_type_boolean(self): 68 | """Test detection of boolean query type.""" 69 | processor = QueryProcessor() 70 | 71 | assert processor._detect_type("machine learning AND algorithms") == "boolean" 72 | assert processor._detect_type("python OR javascript") == "boolean" 73 | assert processor._detect_type("api -deprecated") == "boolean" 74 | assert processor._detect_type("search -old") == "boolean" 75 | 76 | def test_expand_query_simple(self): 77 | """Test simple query expansion.""" 78 | processor = QueryProcessor() 79 | 80 | # Test API expansion 81 | expanded = processor._expand_query("api documentation") 82 | assert "application programming interface" in expanded 83 | assert "api" in expanded 84 | 85 | # Test ML expansion 86 | expanded = processor._expand_query("ml algorithms") 87 | assert "machine learning" in expanded 88 | assert "ml" in 
expanded 89 | 90 | def test_expand_query_multiple_terms(self): 91 | """Test query expansion with multiple expandable terms.""" 92 | processor = QueryProcessor() 93 | 94 | expanded = processor._expand_query("api and ml") 95 | 96 | # Should expand both terms 97 | assert "application programming interface" in expanded 98 | assert "machine learning" in expanded 99 | assert "OR" in expanded # Should use OR syntax 100 | 101 | def test_expand_query_no_expansion_needed(self): 102 | """Test query expansion when no terms need expanding.""" 103 | processor = QueryProcessor() 104 | 105 | original = "python programming tutorial" 106 | expanded = processor._expand_query(original) 107 | 108 | # Should return same query (lowercased) 109 | assert expanded == original.lower() 110 | 111 | def test_extract_operators_negative_terms(self): 112 | """Test extraction of negative terms.""" 113 | processor = QueryProcessor() 114 | 115 | must_have, must_not = processor._extract_operators("machine learning -deprecated -old") 116 | 117 | assert must_not == ["deprecated", "old"] 118 | assert must_have == [] # No AND terms in this example 119 | 120 | def test_extract_operators_and_terms(self): 121 | """Test extraction of AND terms.""" 122 | processor = QueryProcessor() 123 | 124 | must_have, must_not = processor._extract_operators("machine AND learning AND algorithms") 125 | 126 | assert "machine" in must_have 127 | assert "learning" in must_have 128 | assert must_not == [] 129 | 130 | def test_extract_operators_mixed(self): 131 | """Test extraction of mixed boolean operators.""" 132 | processor = QueryProcessor() 133 | 134 | must_have, must_not = processor._extract_operators("machine AND learning -deprecated") 135 | 136 | assert "machine" in must_have 137 | assert "deprecated" in must_not 138 | 139 | def test_process_keyword_query(self, query_processor_test_cases): 140 | """Test processing of keyword queries.""" 141 | processor = QueryProcessor() 142 | 143 | result = processor.process("machine 
learning") 144 | 145 | assert result["original"] == "machine learning" 146 | assert result["type"] == "keyword" 147 | assert result["boost_exact"] is False 148 | assert "machine" in result["terms"] 149 | assert "learning" in result["terms"] 150 | 151 | def test_process_exact_phrase_query(self): 152 | """Test processing of exact phrase queries.""" 153 | processor = QueryProcessor() 154 | 155 | result = processor.process('"machine learning"') 156 | 157 | assert result["original"] == '"machine learning"' 158 | assert result["type"] == "exact" 159 | assert result["boost_exact"] is True 160 | assert result["processed"] == "machine learning" 161 | assert result["terms"] == ["machine learning"] 162 | 163 | def test_process_question_query(self): 164 | """Test processing of question queries.""" 165 | processor = QueryProcessor() 166 | 167 | result = processor.process("How does machine learning work?") 168 | 169 | assert result["type"] == "question" 170 | assert result["boost_exact"] is False 171 | assert "how" in result["terms"] 172 | assert "machine" in result["terms"] 173 | assert "learning" in result["terms"] 174 | 175 | def test_process_boolean_query_with_negation(self): 176 | """Test processing of boolean queries with negation.""" 177 | processor = QueryProcessor() 178 | 179 | result = processor.process("machine learning -deep") 180 | 181 | assert result["type"] == "boolean" 182 | assert "deep" in result["must_not"] 183 | assert result["boost_exact"] is False 184 | 185 | def test_process_boolean_query_with_and(self): 186 | """Test processing of boolean queries with AND.""" 187 | processor = QueryProcessor() 188 | 189 | result = processor.process("machine AND learning") 190 | 191 | assert result["type"] == "boolean" 192 | assert "machine" in result["must_have"] 193 | assert result["boost_exact"] is False 194 | 195 | def test_process_query_with_expansion(self): 196 | """Test processing with query expansion.""" 197 | processor = QueryProcessor() 198 | 199 | result = 
processor.process("api documentation") 200 | 201 | # Should expand 'api' term 202 | assert "application programming interface" in result["processed"] 203 | assert result["original"] == "api documentation" 204 | 205 | def test_process_preserves_original_query(self): 206 | """Test that original query is preserved during processing.""" 207 | processor = QueryProcessor() 208 | 209 | original = "API Development Guide" 210 | result = processor.process(original) 211 | 212 | assert result["original"] == original 213 | assert result["processed"].lower() != original.lower() # Should be different due to expansion 214 | 215 | def test_process_empty_query(self): 216 | """Test processing of empty query.""" 217 | processor = QueryProcessor() 218 | 219 | result = processor.process("") 220 | 221 | assert result["original"] == "" 222 | assert result["processed"] == "" 223 | assert result["type"] == "keyword" # Default type 224 | assert result["terms"] == [] 225 | 226 | def test_process_whitespace_only_query(self): 227 | """Test processing of whitespace-only query.""" 228 | processor = QueryProcessor() 229 | 230 | result = processor.process(" \t\n ") 231 | 232 | assert result["original"] == " \t\n " 233 | assert result["processed"] == "" 234 | assert result["terms"] == [] 235 | 236 | def test_case_insensitive_expansion(self): 237 | """Test that query expansion is case insensitive.""" 238 | processor = QueryProcessor() 239 | 240 | # Test uppercase 241 | result_upper = processor.process("API documentation") 242 | assert "application programming interface" in result_upper["processed"] 243 | 244 | # Test mixed case 245 | result_mixed = processor.process("Api Documentation") 246 | assert "application programming interface" in result_mixed["processed"] 247 | 248 | # Test lowercase (already tested in other tests) 249 | result_lower = processor.process("api documentation") 250 | assert "application programming interface" in result_lower["processed"] 251 | 252 | def 
test_custom_expansions_work(self): 253 | """Test that custom expansions work correctly.""" 254 | custom_expansions = { 255 | "db": ["db", "database", "data store"], 256 | "ui": ["ui", "user interface", "frontend"] 257 | } 258 | 259 | processor = QueryProcessor(custom_expansions) 260 | 261 | result = processor.process("db design") 262 | 263 | assert "database" in result["processed"] 264 | assert "data store" in result["processed"] 265 | 266 | def test_expansion_preserves_other_terms(self): 267 | """Test that expansion preserves non-expandable terms.""" 268 | processor = QueryProcessor() 269 | 270 | result = processor.process("api server configuration") 271 | 272 | # 'api' should be expanded 273 | assert "application programming interface" in result["processed"] 274 | # Other terms should be preserved 275 | assert "server" in result["processed"] 276 | assert "configuration" in result["processed"] 277 | 278 | def test_multiple_exact_phrases_not_supported(self): 279 | """Test behavior with multiple quoted phrases (edge case).""" 280 | processor = QueryProcessor() 281 | 282 | # This is an edge case - typically only one quoted phrase expected 283 | result = processor.process('"first phrase" "second phrase"') 284 | 285 | # Should detect as exact type and process first phrase 286 | assert result["type"] == "exact" 287 | # Behavior may vary, but should handle gracefully 288 | 289 | def test_malformed_quotes_handling(self): 290 | """Test handling of malformed quote queries.""" 291 | processor = QueryProcessor() 292 | 293 | # Unmatched quote 294 | result = processor.process('machine learning"') 295 | # Should not be detected as exact phrase 296 | assert result["type"] != "exact" 297 | 298 | # Empty quotes 299 | result = processor.process('""') 300 | # Should handle gracefully 301 | assert result["type"] == "exact" 302 | 303 | @pytest.mark.parametrize("query,expected_type", [ 304 | ("simple query", "keyword"), 305 | ('"exact phrase"', "exact"), 306 | ("How does this work?", 
"question"), 307 | ("term1 AND term2", "boolean"), 308 | ("term -exclude", "boolean"), 309 | ("What is API?", "question"), # Question with expandable term 310 | ("", "keyword") # Empty defaults to keyword 311 | ]) 312 | def test_query_type_detection_parametrized(self, query, expected_type): 313 | """Parametrized test for query type detection.""" 314 | processor = QueryProcessor() 315 | result = processor.process(query) 316 | assert result["type"] == expected_type -------------------------------------------------------------------------------- /raggy/setup/dependencies.py: -------------------------------------------------------------------------------- 1 | """Dependency management and auto-installation.""" 2 | 3 | import importlib.util 4 | import subprocess 5 | import sys 6 | import time 7 | from pathlib import Path 8 | from typing import Any, Dict, List 9 | 10 | from ..config.cache import load_deps_cache, save_deps_cache 11 | from .environment import check_environment_setup, check_uv_available 12 | 13 | 14 | class PackageInstaller: 15 | """Handles package installation with caching and validation.""" 16 | 17 | # Special cases where package name differs from import name 18 | IMPORT_NAME_MAP = { 19 | "python-magic-bin": "magic", 20 | "python-magic": "magic", 21 | "python-docx": "docx", 22 | "pyyaml": "yaml", 23 | "PyPDF2": "PyPDF2", 24 | } 25 | 26 | def __init__(self, skip_cache: bool = False) -> None: 27 | """Initialize installer with cache configuration. 28 | 29 | Args: 30 | skip_cache: If True, skip cache and always check/install 31 | 32 | """ 33 | self.skip_cache = skip_cache 34 | self.cache: Dict[str, Any] = {} if skip_cache else load_deps_cache() 35 | self.cache_updated = False 36 | 37 | def install_packages(self, packages: List[str], silent_fail: bool = False) -> None: 38 | """Install all packages if missing. 
39 | 40 | Args: 41 | packages: List of package specifications (e.g., "chromadb>=0.4.0") 42 | silent_fail: If True, don't print error messages on failure 43 | 44 | """ 45 | self._validate_environment() 46 | 47 | for package_spec in packages: 48 | self._install_package(package_spec, silent_fail=silent_fail) 49 | 50 | if self.cache_updated: 51 | save_deps_cache(self.cache) 52 | 53 | def _validate_environment(self) -> None: 54 | """Validate UV and environment setup. 55 | 56 | Exits with error if validation fails. 57 | """ 58 | if not check_uv_available(): 59 | sys.exit(1) 60 | 61 | env_ok, env_issue = check_environment_setup() 62 | if not env_ok: 63 | self._report_env_issue(env_issue) 64 | sys.exit(1) 65 | 66 | def _report_env_issue(self, env_issue: str) -> None: 67 | """Report specific environment issue to user. 68 | 69 | Args: 70 | env_issue: Type of environment issue 71 | 72 | """ 73 | error_messages = { 74 | "virtual_environment": ( 75 | "ERROR: No virtual environment found.\n" 76 | "Run 'python raggy.py init' to set up the project environment." 77 | ), 78 | "pyproject": ( 79 | "ERROR: No pyproject.toml found.\n" 80 | "Run 'python raggy.py init' to set up the project environment." 81 | ), 82 | "invalid_venv": ( 83 | "ERROR: Invalid virtual environment found.\n" 84 | "Delete .venv directory and run 'python raggy.py init' to recreate it." 85 | ), 86 | "missing_dependencies": ( 87 | "ERROR: Required dependencies are not installed.\n" 88 | "If you installed raggy as a package, run: pip install 'raggy[all]'\n" 89 | "If using from source, run: pip install -e '.[all]'\n" 90 | "Or manually install: pip install chromadb sentence-transformers PyPDF2 python-docx" 91 | ), 92 | } 93 | message = error_messages.get( 94 | env_issue, f"ERROR: Environment issue: {env_issue}" 95 | ) 96 | print(message) 97 | 98 | def _install_package(self, package_spec: str, silent_fail: bool = False) -> None: 99 | """Install single package if not cached or installed. 
100 | 101 | Args: 102 | package_spec: Package specification (e.g., "chromadb>=0.4.0") 103 | silent_fail: If True, don't print error messages on failure 104 | 105 | """ 106 | package_name = self._extract_package_name(package_spec) 107 | 108 | # Check cache first 109 | if not self.skip_cache and package_name in self.cache.get("installed", {}): 110 | return 111 | 112 | # Check if already installed 113 | if self._is_already_installed(package_name): 114 | self._update_cache(package_name) 115 | return 116 | 117 | # Install the package 118 | self._perform_install(package_spec, package_name, silent_fail=silent_fail) 119 | 120 | def _extract_package_name(self, package_spec: str) -> str: 121 | """Extract package name from specification. 122 | 123 | Args: 124 | package_spec: Package specification like 'package>=1.0' or 'package[extra]' 125 | 126 | Returns: 127 | str: Clean package name 128 | 129 | """ 130 | return package_spec.split(">=")[0].split("==")[0].split("[")[0] 131 | 132 | def _get_import_name(self, package_name: str) -> str: 133 | """Get import name for package (may differ from package name). 134 | 135 | Args: 136 | package_name: Package name as used in pip 137 | 138 | Returns: 139 | str: Import name for use with importlib 140 | 141 | """ 142 | return self.IMPORT_NAME_MAP.get(package_name, package_name.replace("-", "_")) 143 | 144 | def _is_already_installed(self, package_name: str) -> bool: 145 | """Check if package is already installed. 146 | 147 | Args: 148 | package_name: Package name to check 149 | 150 | Returns: 151 | bool: True if package can be imported 152 | 153 | """ 154 | import_name = self._get_import_name(package_name) 155 | try: 156 | spec = importlib.util.find_spec(import_name) 157 | return spec is not None 158 | except (ImportError, ModuleNotFoundError): 159 | return False 160 | 161 | def _update_cache(self, package_name: str) -> None: 162 | """Update cache with installed package timestamp. 
163 | 164 | Args: 165 | package_name: Package name to cache 166 | 167 | """ 168 | if "installed" not in self.cache: 169 | self.cache["installed"] = {} 170 | self.cache["installed"][package_name] = time.time() 171 | self.cache_updated = True 172 | 173 | def _perform_install(self, package_spec: str, package_name: str, silent_fail: bool = False) -> None: 174 | """Perform actual package installation. 175 | 176 | Args: 177 | package_spec: Full package specification for pip 178 | package_name: Package name for error handling 179 | silent_fail: If True, don't print error messages on failure 180 | 181 | """ 182 | if not silent_fail: 183 | print(f"Installing {package_name}...") 184 | 185 | # Check if we're in a virtual environment 186 | in_venv = sys.prefix != sys.base_prefix 187 | 188 | try: 189 | if in_venv: 190 | # In a venv, use uv without --system flag 191 | subprocess.check_call(["uv", "pip", "install", package_spec], 192 | stdout=subprocess.DEVNULL if silent_fail else None, 193 | stderr=subprocess.DEVNULL if silent_fail else None) 194 | else: 195 | # Not in a venv, try uv with --system flag 196 | try: 197 | subprocess.check_call(["uv", "pip", "install", "--system", package_spec], 198 | stdout=subprocess.DEVNULL if silent_fail else None, 199 | stderr=subprocess.DEVNULL if silent_fail else None) 200 | except subprocess.CalledProcessError: 201 | # If uv fails, fall back to regular pip 202 | subprocess.check_call([sys.executable, "-m", "pip", "install", package_spec], 203 | stdout=subprocess.DEVNULL if silent_fail else None, 204 | stderr=subprocess.DEVNULL if silent_fail else None) 205 | 206 | self._update_cache(package_name) 207 | except subprocess.CalledProcessError as e: 208 | if not silent_fail: 209 | print(f"Failed to install {package_name}: {e}") 210 | self._try_fallback_install(package_name, silent_fail=silent_fail) 211 | 212 | def _try_fallback_install(self, package_name: str, silent_fail: bool = False) -> None: 213 | """Try fallback installation for special 
packages. 214 | 215 | Args: 216 | package_name: Package that failed to install 217 | silent_fail: If True, don't print error messages on failure 218 | 219 | """ 220 | if package_name != "python-magic-bin": 221 | return 222 | 223 | if not silent_fail: 224 | print("Trying alternative magic package...") 225 | 226 | # Check if we're in a virtual environment 227 | in_venv = sys.prefix != sys.base_prefix 228 | 229 | try: 230 | if in_venv: 231 | subprocess.check_call(["uv", "pip", "install", "python-magic"], 232 | stdout=subprocess.DEVNULL if silent_fail else None, 233 | stderr=subprocess.DEVNULL if silent_fail else None) 234 | else: 235 | try: 236 | subprocess.check_call(["uv", "pip", "install", "--system", "python-magic"], 237 | stdout=subprocess.DEVNULL if silent_fail else None, 238 | stderr=subprocess.DEVNULL if silent_fail else None) 239 | except subprocess.CalledProcessError: 240 | subprocess.check_call([sys.executable, "-m", "pip", "install", "python-magic"], 241 | stdout=subprocess.DEVNULL if silent_fail else None, 242 | stderr=subprocess.DEVNULL if silent_fail else None) 243 | 244 | self._update_cache(package_name) 245 | except subprocess.CalledProcessError: 246 | if not silent_fail: 247 | print( 248 | "Warning: Could not install python-magic. " 249 | "File type detection may be limited." 250 | ) 251 | 252 | 253 | def install_if_missing(packages: List[str], skip_cache: bool = False, silent_fail: bool = False) -> None: 254 | """Auto-install required packages if missing using uv. 
255 | 256 | Args: 257 | packages: List of package specifications (e.g., "chromadb>=0.4.0") 258 | skip_cache: If True, skip cache and always check/install 259 | silent_fail: If True, don't print error messages on failure 260 | 261 | """ 262 | installer = PackageInstaller(skip_cache=skip_cache) 263 | installer.install_packages(packages, silent_fail=silent_fail) 264 | 265 | 266 | def setup_dependencies(skip_cache: bool = False, quiet: bool = False) -> None: 267 | """Setup dependencies with optional caching. 268 | 269 | Args: 270 | skip_cache: If True, skip cache and always check/install 271 | quiet: If True, suppress output (unused but kept for compatibility) 272 | 273 | """ 274 | 275 | # Check if we're in a virtual environment 276 | env_ok, env_issue = check_environment_setup() 277 | 278 | if not env_ok: 279 | if env_issue == "missing_dependencies": 280 | print("\nERROR: Required dependencies are not installed.") 281 | print("\nIf you installed raggy as a package:") 282 | print(" pip install 'raggy[all]'") 283 | print("\nIf you're developing raggy:") 284 | print(" pip install -e '.[all]'") 285 | print("\nOr install manually:") 286 | print(" pip install chromadb sentence-transformers PyPDF2 python-docx") 287 | elif env_issue == "virtual_environment": 288 | print("\nERROR: Local .venv exists but is not activated.") 289 | print("\nPlease activate your virtual environment:") 290 | if sys.platform == "win32": 291 | print(" .venv\\Scripts\\activate") 292 | else: 293 | print(" source .venv/bin/activate") 294 | print("\nThen run the command again.") 295 | else: 296 | print("\nERROR: Environment is not properly set up.") 297 | print(f"Issue: {env_issue}") 298 | print("\nFor local development:") 299 | print(" python -m venv .venv") 300 | if sys.platform == "win32": 301 | print(" .venv\\Scripts\\activate") 302 | else: 303 | print(" source .venv/bin/activate") 304 | print(" pip install -e '.[all]'") 305 | sys.exit(1) 306 | 307 | # Environment is OK, proceed with dependency 
checks 308 | 309 | # Auto-install required packages if missing 310 | required_packages = [ 311 | "chromadb>=0.4.0", 312 | "sentence-transformers>=2.2.0", 313 | "PyPDF2>=3.0.0", 314 | "python-docx>=1.0.0", 315 | ] 316 | 317 | # Add optional packages (non-blocking) 318 | optional_packages = ["pyyaml>=6.0", "torch>=2.0.0"] 319 | 320 | # Platform-specific magic library is optional (for file type detection) 321 | if sys.platform == "win32": 322 | optional_packages.append("python-magic-bin>=0.4.14") 323 | else: 324 | optional_packages.append("python-magic") 325 | 326 | # Install required packages 327 | install_if_missing(required_packages, skip_cache=skip_cache) 328 | 329 | # Try to install optional packages but don't fail if they can't be installed 330 | for package in optional_packages: 331 | try: 332 | # Check if already installed before trying to install 333 | package_name = package.split(">=")[0].split("==")[0].split("[")[0] 334 | installer = PackageInstaller(skip_cache=skip_cache) 335 | if not installer._is_already_installed(package_name): 336 | # Use silent_fail=True for optional packages 337 | install_if_missing([package], skip_cache=skip_cache, silent_fail=True) 338 | except (subprocess.CalledProcessError, OSError, RuntimeError): 339 | # Installation failed for optional package - silently continue 340 | # This is expected for packages that may not be available in all environments 341 | pass 342 | --------------------------------------------------------------------------------