├── LICENSE ├── safe_store ├── config.py ├── core │ ├── __init__.py │ ├── models.py │ └── exceptions.py ├── utils │ ├── __init__.py │ ├── concurrency.py │ └── json_parsing.py ├── depricated.py ├── indexing │ └── __init__.py ├── search │ ├── __init__.py │ └── similarity.py ├── security │ ├── __init__.py │ └── encryption.py ├── vectorization │ ├── __init__.py │ ├── methods │ │ ├── __init__.py │ │ ├── sentense_transformer │ │ │ ├── description.yaml │ │ │ └── __init__.py │ │ ├── ollama │ │ │ ├── description.yaml │ │ │ └── __init__.py │ │ ├── lollms │ │ │ ├── description.yaml │ │ │ └── __init__.py │ │ ├── openai │ │ │ └── description.yaml │ │ ├── tf_idf │ │ │ ├── description.yaml │ │ │ └── __init__.py │ │ └── cohere │ │ │ ├── description.yaml │ │ │ └── __init__.py │ ├── base.py │ ├── utils.py │ └── manager.py ├── graph │ ├── __init__.py │ └── prompts │ │ ├── entity_fusion_prompt.md │ │ ├── query_parsing_prompt.md │ │ ├── graph_extraction_prompt.md │ │ └── graph_extraction_prompt_with_ontology.md ├── __init__.py └── processing │ ├── tokenizers.py │ └── text_cleaning.py ├── examples ├── SafeStoreGraph │ ├── .gitignore │ ├── icon.png │ ├── requirements.txt │ └── description.yaml ├── requirements.txt ├── basic_usage_text.py ├── encryption_usage.py ├── dynamic_model_selection.py ├── custom_logging.py ├── metadata_generation.py ├── basic_usage.py └── graph_usage.py ├── tests ├── security │ ├── __init__.py │ └── test_encryption.py ├── fixtures │ ├── sample.docx │ ├── sample.pdf │ └── sample.html ├── test_chunking.py ├── test_store_phase4.py ├── conftest.py └── test_store_phase3.py ├── docs ├── requirements.txt ├── api.rst ├── index.rst ├── conf.py ├── installation.rst ├── logging.rst ├── quickstart.rst └── encryption.rst ├── temp_docs_point_cloud ├── animals.txt ├── tech.txt └── space.txt ├── point_cloud_web_app ├── data.json └── index.html ├── pyproject.toml ├── .gitignore └── scripts └── migration_v1_v2.py /LICENSE: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /safe_store/config.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /safe_store/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /safe_store/core/models.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /safe_store/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /safe_store/depricated.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /safe_store/indexing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /safe_store/search/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /safe_store/security/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /safe_store/utils/concurrency.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /safe_store/vectorization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /safe_store/vectorization/methods/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/SafeStoreGraph/.gitignore: -------------------------------------------------------------------------------- 1 | projects 2 | config.json -------------------------------------------------------------------------------- /tests/security/__init__.py: -------------------------------------------------------------------------------- 1 | # tests/security/__init__.py 2 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | Sphinx>=5.0 2 | sphinx-rtd-theme>=1.0 3 | # Add other Sphinx extensions if needed 4 | -------------------------------------------------------------------------------- /tests/fixtures/sample.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParisNeo/safe_store/HEAD/tests/fixtures/sample.docx -------------------------------------------------------------------------------- /tests/fixtures/sample.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParisNeo/safe_store/HEAD/tests/fixtures/sample.pdf -------------------------------------------------------------------------------- /examples/SafeStoreGraph/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParisNeo/safe_store/HEAD/examples/SafeStoreGraph/icon.png -------------------------------------------------------------------------------- /temp_docs_point_cloud/animals.txt: -------------------------------------------------------------------------------- 1 | The quick brown fox jumps over the lazy dog. A fast red fox is athletic. The sleepy dog rests. -------------------------------------------------------------------------------- /tests/fixtures/sample.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |

This is HTML content.

5 | 6 | 7 | -------------------------------------------------------------------------------- /safe_store/graph/__init__.py: -------------------------------------------------------------------------------- 1 | # safe_store/graph/__init__.py 2 | from .graph_store import GraphStore 3 | 4 | __all__ = [ 5 | "GraphStore", 6 | ] -------------------------------------------------------------------------------- /temp_docs_point_cloud/tech.txt: -------------------------------------------------------------------------------- 1 | Python is a versatile programming language. Many developers use Python for AI. RAG pipelines are a common use case. -------------------------------------------------------------------------------- /examples/SafeStoreGraph/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi 2 | uvicorn[standard] 3 | python-multipart 4 | lollms_client>=1.3.0 5 | safe_store>=2.7.0 6 | pipmaster -------------------------------------------------------------------------------- /temp_docs_point_cloud/space.txt: -------------------------------------------------------------------------------- 1 | The sun is a star at the center of our solar system. The Earth revolves around the sun. Space exploration is fascinating. -------------------------------------------------------------------------------- /examples/requirements.txt: -------------------------------------------------------------------------------- 1 | safe-store 2 | extract-msg 3 | beautifulsoup4 4 | sentence-transformers 5 | scikit-learn 6 | pandas 7 | plotly 8 | lollms_client 9 | -------------------------------------------------------------------------------- /examples/SafeStoreGraph/description.yaml: -------------------------------------------------------------------------------- 1 | author: ParisNeo & Lollms 2 | category: Data 3 | creation_date: '2025-08-18T10:05:00.000000' 4 | description: This application allows the user to upload files and convert them to a Graph with an optional ontology. The user can then query the graph using natural language questions. The application uses SafeStore for graph storage and retrieval, and Gemini Pro as the LLM for processing and answering questions. 5 | disclaimer: null 6 | last_update_date: '2025-08-18T10:05:00.000000' 7 | model: Gemini Pro 8 | name: SafeStoreGraph 9 | version: 1.0 10 | -------------------------------------------------------------------------------- /safe_store/vectorization/methods/sentense_transformer/description.yaml: -------------------------------------------------------------------------------- 1 | title: Sentence Transformers Vectorizer 2 | author: ParisNeo 3 | creation_date: 2025-10-10 4 | last_update_date: 2025-10-10 5 | class_name: STVectorizer 6 | description: > 7 | A local vectorizer that uses models from the sentence-transformers library. 8 | Models are downloaded and run directly on your machine. 9 | input_parameters: 10 | - name: model 11 | type: str 12 | description: "The name of the Sentence Transformer model to use from the Hugging Face Hub." 
13 | mandatory: true 14 | default: "all-MiniLM-L6-v2" -------------------------------------------------------------------------------- /safe_store/vectorization/methods/ollama/description.yaml: -------------------------------------------------------------------------------- 1 | title: Ollama Vectorizer 2 | author: ParisNeo 3 | creation_date: 2025-10-10 4 | last_update_date: 2025-10-10 5 | class_name: OllamaVectorizer 6 | description: > 7 | A vectorizer that uses a local Ollama instance to generate text embeddings. 8 | Requires a running Ollama server. 9 | input_parameters: 10 | - name: model 11 | type: str 12 | description: "The name of the embedding model to use from your Ollama server (e.g., 'nomic-embed-text')." 13 | mandatory: true 14 | default: "" 15 | - name: host 16 | type: str 17 | description: "The URL of the Ollama server. If not provided, it defaults to http://localhost:11434 or the OLLAMA_HOST environment variable." 18 | mandatory: false 19 | default: "" -------------------------------------------------------------------------------- /safe_store/vectorization/methods/lollms/description.yaml: -------------------------------------------------------------------------------- 1 | title: Lollms Vectorizer 2 | author: ParisNeo 3 | creation_date: 2025-10-10 4 | last_update_date: 2025-10-10 5 | class_name: LollmsVectorizer 6 | description: > 7 | A vectorizer that connects to any OpenAI-compatible API, such as a local Lollms 8 | instance, for generating embeddings. 9 | input_parameters: 10 | - name: model 11 | type: str 12 | description: "The name of the embedding model served by the Lollms instance." 13 | mandatory: true 14 | default: "nomic-embed-text" 15 | - name: base_url 16 | type: str 17 | description: "The base URL of the OpenAI-compatible API endpoint." 18 | mandatory: true 19 | default: "http://localhost:9600" 20 | - name: api_key 21 | type: str 22 | description: "The API key for the service. Often not required for local instances." 23 | mandatory: false 24 | default: "not_needed" -------------------------------------------------------------------------------- /safe_store/vectorization/methods/openai/description.yaml: -------------------------------------------------------------------------------- 1 | title: OpenAI Vectorizer 2 | author: ParisNeo 3 | creation_date: 2025-10-10 4 | last_update_date: 2025-10-10 5 | class_name: OpenAIVectorizer 6 | description: > 7 | A vectorizer that uses OpenAI's API to generate text embeddings. 8 | Requires an OpenAI API key. 9 | input_parameters: 10 | - name: model 11 | type: str 12 | description: "The name of the OpenAI embedding model to use." 13 | mandatory: true 14 | default: "text-embedding-3-small" 15 | - name: api_key 16 | type: str 17 | description: "Your OpenAI API key. If not provided, the OPENAI_API_KEY environment variable will be used." 18 | mandatory: false 19 | default: "" 20 | - name: base_url 21 | type: str 22 | description: "Optional custom base URL for the OpenAI API, for use with proxies or other compatible services." 23 | mandatory: false 24 | default: "" -------------------------------------------------------------------------------- /safe_store/vectorization/methods/tf_idf/description.yaml: -------------------------------------------------------------------------------- 1 | title: TF-IDF Vectorizer 2 | author: ParisNeo 3 | creation_date: 2025-10-10 4 | last_update_date: 2025-10-10 5 | class_name: TfidfVectorizerWrapper 6 | description: > 7 | A classic, local vectorizer based on Term Frequency-Inverse Document Frequency. 
8 |   This vectorizer must be 'fit' on your data, so its performance is data-dependent.
9 |   It does not capture semantic meaning like deep learning models.
10 | input_parameters:
11 |   - name: name
12 |     type: str
13 |     description: "A unique name to identify this specific fitted TF-IDF model within the database."
14 |     mandatory: true
15 |     default: "default_tfidf"
16 |   - name: params
17 |     type: dict
18 |     description: "Optional dictionary of parameters to pass to the underlying scikit-learn TfidfVectorizer, such as 'ngram_range' or 'max_features'."
19 |     mandatory: false
20 |     default: {}
--------------------------------------------------------------------------------
/safe_store/graph/prompts/entity_fusion_prompt.md:
--------------------------------------------------------------------------------
1 | # [NEW & COMPLETE] prompts/entity_fusion_prompt.md
2 | Your task is to determine if two entities of the same type are, in fact, the same entity based on their properties.
3 | 
4 | **Entity Type:** {entity_label}
5 | 
6 | ---
7 | 
8 | **Entity A Properties:**
9 | ```json
10 | {node_a_properties}
11 | ```
12 | 
13 | ---
14 | 
15 | **Entity B Properties:**
16 | ```json
17 | {node_b_properties}
18 | ```
19 | 
20 | ---
21 | 
22 | **Analysis:**
23 | Carefully compare the properties of Entity A and Entity B. Do they refer to the same real-world entity? Consider variations in naming, partial information, or different levels of detail.
24 | 
25 | **Output Format:**
26 | You MUST respond with only a single, well-formed JSON object in a markdown code block. The JSON object must have two keys:
27 | 1. `"is_same"`: A boolean (`true` or `false`).
28 | 2. `"reasoning"`: A brief, one-sentence explanation for your decision.
29 | 
30 | **Example Response:**
31 | ```json
32 | {{
33 |   "is_same": true,
34 |   "reasoning": "Both entities share the same unique identifier and have highly similar descriptive properties."
35 | }}
36 | ```
--------------------------------------------------------------------------------
/docs/api.rst:
--------------------------------------------------------------------------------
1 | ===
2 | API
3 | ===
4 | 
5 | This section provides detailed documentation for the ``safe_store`` library's public API.
6 | 
7 | Core Class
8 | ----------
9 | 
10 | .. automodule:: safe_store.store
11 |     :members: SafeStore, LogLevel
12 |     :undoc-members:
13 |     :show-inheritance:
14 | 
15 | Exceptions
16 | ----------
17 | 
18 | .. automodule:: safe_store.core.exceptions
19 |     :members:
20 |     :undoc-members:
21 |     :show-inheritance:
22 | 
23 | Vectorizers
24 | -----------
25 | 
26 | .. automodule:: safe_store.vectorization.base
27 |     :members: BaseVectorizer
28 |     :undoc-members:
29 | 
30 | .. automodule:: safe_store.vectorization.methods.sentense_transformer
31 |     :members: STVectorizer
32 |     :undoc-members:
33 | 
34 | .. automodule:: safe_store.vectorization.methods.tf_idf
35 |     :members: TfIdfVectorizer
36 |     :undoc-members:
37 | 
38 | Utilities
39 | ---------
40 | While primarily used internally, the ``ascii_colors`` library is exposed for configuration.
41 | 
42 | .. 
automodule:: ascii_colors 43 | :members: ASCIIColors, LogLevel, FileHandler, Formatter, JSONFormatter 44 | :undoc-members: 45 | 46 | (Add other modules/classes as needed) 47 | -------------------------------------------------------------------------------- /safe_store/vectorization/methods/cohere/description.yaml: -------------------------------------------------------------------------------- 1 | title: Cohere Vectorizer 2 | author: ParisNeo 3 | creation_date: 2025-10-10 4 | last_update_date: 2025-10-10 5 | class_name: CohereVectorizer 6 | description: > 7 | A vectorizer that uses Cohere's API to generate text embeddings. 8 | Requires a Cohere API key, which can be provided via the 'api_key' parameter 9 | or the COHERE_API_KEY environment variable. 10 | input_parameters: 11 | - name: model 12 | type: str 13 | description: "The name of the Cohere embedding model to use." 14 | mandatory: true 15 | default: "embed-english-v3.0" 16 | - name: api_key 17 | type: str 18 | description: "Your Cohere API key. If not provided, the COHERE_API_KEY environment variable will be used." 19 | mandatory: false 20 | default: "" 21 | - name: input_type 22 | type: str 23 | description: "The type of input being embedded, e.g., 'search_document' or 'search_query'." 24 | mandatory: false 25 | default: "search_document" 26 | - name: truncate 27 | type: str 28 | description: "The truncation strategy for inputs longer than the model's context window ('NONE', 'START', 'END')." 29 | mandatory: false 30 | default: "END" -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. safe_store documentation master file, created by 2 | sphinx-quickstart on . 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to safe_store's documentation! 7 | ===================================== 8 | 9 | **safe_store** is a Python library providing a lightweight, file-based vector database using SQLite. It's designed for simplicity and efficiency, making it ideal for integrating into local Retrieval-Augmented Generation (RAG) pipelines. 10 | 11 | Key Features: 12 | 13 | * **Local SQLite Backend:** Simple, single-file database. 14 | * **Concurrency Safe:** Handles multiple processes writing via file locks. 15 | * **Multiple Vectorizers:** Supports Sentence Transformers, TF-IDF, etc. 16 | * **Document Parsing:** Handles `.txt`, `.pdf`, `.docx`, `.html`. 17 | * **Optional Encryption:** Securely store chunk text at rest. 18 | * **Informative Logging:** Clear console output via `ascii_colors`. 19 | 20 | .. 
toctree:: 21 | :maxdepth: 2 22 | :caption: Contents: 23 | 24 | installation 25 | quickstart 26 | logging 27 | encryption 28 | api 29 | 30 | 31 | Indices and tables 32 | ================== 33 | 34 | * :ref:`genindex` 35 | * :ref:`modindex` 36 | * :ref:`search` 37 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # docs/conf.py 2 | import os 3 | import sys 4 | sys.path.insert(0, os.path.abspath('..')) # Add project root to path 5 | import safe_store # Import your package 6 | 7 | project = 'safe_store' 8 | copyright = '2025, ParisNeo' # Update year/author 9 | author = 'ParisNeo' 10 | 11 | # Get version from package 12 | release = safe_store.__version__ 13 | 14 | extensions = [ 15 | 'sphinx.ext.autodoc', 16 | 'sphinx.ext.napoleon', # For Google/NumPy style docstrings 17 | 'sphinx.ext.intersphinx', 18 | 'sphinx.ext.viewcode', 19 | 'sphinx_rtd_theme', 20 | ] 21 | 22 | templates_path = ['_templates'] 23 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 24 | 25 | html_theme = 'sphinx_rtd_theme' 26 | html_static_path = ['_static'] 27 | 28 | # Autodoc settings 29 | autodoc_member_order = 'bysource' 30 | autodoc_default_options = { 31 | 'members': True, 32 | 'undoc-members': True, 33 | 'private-members': False, 34 | 'special-members': '__init__', # Include __init__ methods 35 | 'show-inheritance': True, 36 | } 37 | 38 | # Intersphinx settings 39 | intersphinx_mapping = { 40 | 'python': ('https://docs.python.org/3', None), 41 | 'numpy': ('https://numpy.org/doc/stable/', None), 42 | 'sklearn': ('https://scikit-learn.org/stable/', None), 43 | # Add others if needed (e.g., cryptography, sentence-transformers) 44 | } 45 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Installation 3 | ============ 4 | 5 | Install ``safe_store`` using pip: 6 | 7 | .. code-block:: bash 8 | 9 | pip install safe_store 10 | 11 | Optional Dependencies 12 | --------------------- 13 | 14 | ``safe_store`` uses optional dependencies for certain features like specific vectorizers or document parsers. You can install these extras as needed: 15 | 16 | * **Sentence Transformers:** For state-of-the-art sentence embeddings. 17 | .. code-block:: bash 18 | 19 | pip install safe_store[sentence-transformers] 20 | 21 | * **TF-IDF:** For classic TF-IDF vectorization (requires scikit-learn). 22 | .. code-block:: bash 23 | 24 | pip install safe_store[tfidf] 25 | 26 | * **Document Parsing:** For handling ``.pdf``, ``.docx``, and ``.html`` files. 27 | .. code-block:: bash 28 | 29 | pip install safe_store[parsing] 30 | 31 | * **Encryption:** For encrypting chunk text at rest (requires cryptography). 32 | .. code-block:: bash 33 | 34 | pip install safe_store[encryption] 35 | 36 | * **All Features:** To install all optional dependencies at once. 37 | .. code-block:: bash 38 | 39 | pip install safe_store[all] 40 | 41 | * **Development:** To install dependencies needed for testing, building, and documentation generation. 42 | .. 
code-block:: bash 43 | 44 | pip install safe_store[dev] 45 | -------------------------------------------------------------------------------- /safe_store/vectorization/base.py: -------------------------------------------------------------------------------- 1 | # safe_store/vectorization/base.py 2 | from abc import ABC, abstractmethod 3 | import numpy as np 4 | from typing import List, Optional, Any 5 | 6 | class BaseVectorizer(ABC): 7 | """ 8 | Abstract base class for all vectorizer implementations within safe_store. 9 | """ 10 | 11 | def __init__(self, vectorizer_name:str="unknown"): 12 | self.vectorizer_name = vectorizer_name 13 | 14 | @abstractmethod 15 | def vectorize(self, texts: List[str]) -> np.ndarray: 16 | """Converts a list of text documents into a NumPy array of vector embeddings.""" 17 | pass 18 | 19 | @property 20 | @abstractmethod 21 | def dim(self) -> Optional[int]: 22 | """The dimension of the vectors produced by this vectorizer.""" 23 | pass 24 | 25 | @property 26 | @abstractmethod 27 | def dtype(self) -> np.dtype: 28 | """The NumPy data type of the vector embeddings.""" 29 | pass 30 | 31 | def get_tokenizer(self) -> Optional[Any]: 32 | """ 33 | Returns the tokenizer associated with the vectorizer, if available. 34 | 35 | The returned tokenizer should have `encode` and `decode` methods 36 | compatible with libraries like Hugging Face's tokenizers. 37 | 38 | Returns: 39 | A tokenizer object or None if no tokenizer is available client-side. 40 | """ 41 | return None 42 | 43 | @staticmethod 44 | def list_models(**kwargs) -> List[str]: 45 | """ 46 | Lists the available models for this vectorizer. 47 | This method should be overridden by subclasses that support model listing. 48 | """ 49 | return [] -------------------------------------------------------------------------------- /safe_store/vectorization/utils.py: -------------------------------------------------------------------------------- 1 | # safe_store/vectorization/utils.py 2 | import importlib.util 3 | from pathlib import Path 4 | from typing import Any 5 | from ..core.exceptions import ConfigurationError 6 | 7 | def load_vectorizer_module(vectorizer_name: str, custom_vectorizers_path: str = None) -> Any: 8 | """Dynamically loads a vectorizer module from built-in methods or a custom path.""" 9 | 10 | # First, try loading from the custom path if provided 11 | if custom_vectorizers_path: 12 | custom_path = Path(custom_vectorizers_path) / vectorizer_name / "__init__.py" 13 | if custom_path.exists(): 14 | try: 15 | spec = importlib.util.spec_from_file_location(f"custom_vectorizers.{vectorizer_name}", custom_path) 16 | if spec and spec.loader: 17 | module = importlib.util.module_from_spec(spec) 18 | spec.loader.exec_module(module) 19 | return module 20 | except Exception as e: 21 | raise ConfigurationError(f"Failed to load custom vectorizer '{vectorizer_name}' from {custom_path}: {e}") from e 22 | 23 | # If not in custom, try built-in methods 24 | builtin_path = Path(__file__).parent / "methods" / vectorizer_name / "__init__.py" 25 | if builtin_path.exists(): 26 | try: 27 | module_name = f"safe_store.vectorization.methods.{vectorizer_name}" 28 | return importlib.import_module(module_name) 29 | except Exception as e: 30 | raise ConfigurationError(f"Failed to load built-in vectorizer '{vectorizer_name}': {e}") from e 31 | 32 | raise FileNotFoundError(f"Vectorizer module '{vectorizer_name}' not found in built-in methods or custom path.") -------------------------------------------------------------------------------- 
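The two files above define the contract a vectorizer must satisfy (`BaseVectorizer`) and how `load_vectorizer_module` resolves a name to either `<custom_vectorizers_path>/<name>/__init__.py` or a built-in package under `methods/`. The following is a minimal sketch of what such a drop-in custom vectorizer module could look like. The folder name `hashing_vectorizer`, the `dim` option, and the toy hashing logic are invented for illustration; only the `BaseVectorizer` interface and the module-level `class_name` convention (mirroring the built-in `tf_idf` method shown later in this listing) come from the sources here, and since `manager.py` is not reproduced in this section, the exact instantiation wiring should be treated as an assumption.

```python
# Hypothetical layout: <custom_vectorizers_path>/hashing_vectorizer/__init__.py
import zlib
from typing import Any, Dict, List, Optional

import numpy as np

from safe_store.vectorization.base import BaseVectorizer

# Module-level class name, mirroring the convention used by the built-in tf_idf method.
class_name = "HashingVectorizer"


class HashingVectorizer(BaseVectorizer):
    """Toy feature-hashing vectorizer: fixed dimension, no fitting step."""

    def __init__(self, model_config: Dict[str, Any], cache_folder: Optional[str] = None):
        super().__init__("hashing")
        # 'dim' is an illustrative parameter, not part of safe_store itself.
        self._dim = int(model_config.get("dim", 256))

    def vectorize(self, texts: List[str]) -> np.ndarray:
        vectors = np.zeros((len(texts), self._dim), dtype=np.float32)
        for row, text in enumerate(texts):
            for token in text.lower().split():
                # Stable hash so the same token always lands in the same bucket.
                vectors[row, zlib.crc32(token.encode("utf-8")) % self._dim] += 1.0
        # L2-normalize so cosine similarity behaves sensibly on these counts.
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        return vectors / np.maximum(norms, 1e-12)

    @property
    def dim(self) -> Optional[int]:
        return self._dim

    @property
    def dtype(self) -> np.dtype:
        return np.float32
```

With a module like this in place, a name such as `"hashing_vectorizer"` together with a custom vectorizers path pointing at its parent folder is what `load_vectorizer_module` expects to find; the corresponding SafeStore constructor arguments are not visible in this section, so that part is an assumption as well.
--------------------------------------------------------------------------------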
/safe_store/graph/prompts/query_parsing_prompt.md: -------------------------------------------------------------------------------- 1 | Parse the following query to identify main entities ("seed_nodes"). 2 | Format the output STRICTLY as a JSON object. 3 | **The entire JSON output MUST be enclosed in a single markdown code block starting with ```json and ending with ```.** 4 | 5 | JSON structure: 6 | ```json 7 | {{ 8 | "seed_nodes": [ 9 | {{"label": "EntityType", "identifying_property_key": "property_name", "identifying_property_value": "property_value"}} 10 | ], 11 | "target_relationships": [ {{"type": "REL_TYPE", "direction": "outgoing|incoming|any"}} ], 12 | "target_node_labels": ["Label1", "Label2"], 13 | "max_depth": 1 14 | }}``` 15 | - "seed_nodes": List of main entities from the query. 16 | - "label": The type of the entity. 17 | - "identifying_property_key": The name of the property that identifies the entity (e.g., "name", "title"). 18 | - "identifying_property_value": The value of that identifying property. 19 | - "target_relationships" (Optional): Desired relationship types and directions. 20 | - "target_node_labels" (Optional): Desired types of neighbor nodes. 21 | - "max_depth" (Optional, default 1): Traversal depth. 22 | 23 | Example Query: "Who is Evelyn Reed and what companies is she associated with?" 24 | Example JSON (wrapped in ```json ... ```): 25 | ```json 26 | {{ 27 | "seed_nodes": [ {{"label": "Person", "identifying_property_key": "name", "identifying_property_value": "Evelyn Reed"}} ], 28 | "target_relationships": [ {{"type": "WORKS_AT", "direction": "any"}}, {{"type": "CEO_OF", "direction": "any"}} ], 29 | "target_node_labels": ["Company", "Organization"], 30 | "max_depth": 1 31 | }} 32 | ``` 33 | 34 | If no clear entities, return `{{ "seed_nodes": [] }}`. 35 | 36 | Query: --- {natural_language_query} --- Parsed JSON Query (wrapped in ```json ... 
```): -------------------------------------------------------------------------------- /safe_store/vectorization/methods/tf_idf/__init__.py: -------------------------------------------------------------------------------- 1 | # safe_store/vectorization/methods/tf_idf/__init__.py 2 | import numpy as np 3 | from typing import List, Optional, Dict, Any 4 | import pickle 5 | from safe_store.vectorization.base import BaseVectorizer 6 | from safe_store.core.exceptions import ConfigurationError, VectorizationError 7 | import pipmaster as pm 8 | 9 | class_name = "TfIdfVectorizer" 10 | 11 | class TfIdfVectorizer(BaseVectorizer): 12 | def __init__(self, model_config: Dict[str, Any], cache_folder: Optional[str] = None): 13 | super().__init__("tfidf") 14 | pm.ensure_packages(["scikit-learn"]) 15 | from sklearn.feature_extraction.text import TfidfVectorizer as SklearnTfidfVectorizer 16 | 17 | self.vectorizer = SklearnTfidfVectorizer() 18 | self._fitted = False 19 | self._dim = None 20 | 21 | def fit(self, texts: List[str]): 22 | self.vectorizer.fit(texts) 23 | self._fitted = True 24 | self._dim = len(self.vectorizer.get_feature_names_out()) 25 | 26 | def vectorize(self, texts: List[str]) -> np.ndarray: 27 | if not self._fitted: 28 | raise VectorizationError("TF-IDF vectorizer must be fitted before vectorizing.") 29 | return self.vectorizer.transform(texts).toarray().astype(np.float32) 30 | 31 | @property 32 | def dim(self) -> Optional[int]: 33 | return self._dim 34 | 35 | @property 36 | def dtype(self) -> np.dtype: 37 | return np.float32 38 | 39 | def get_params_to_store(self) -> Dict[str, Any]: 40 | return {"vectorizer_pickle": pickle.dumps(self.vectorizer)} 41 | 42 | @staticmethod 43 | def list_models(**kwargs) -> List[str]: 44 | """TF-IDF is a data-dependent model, not a pre-trained one. It has one 'model' type.""" 45 | return ["tfidf"] -------------------------------------------------------------------------------- /tests/test_chunking.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from safe_store.indexing.chunking import chunk_text 3 | 4 | def test_chunk_simple(): 5 | text = "abcdefghijklmnopqrstuvwxyz" 6 | chunks = chunk_text(text, chunk_size=10, chunk_overlap=3) 7 | # Expected: 8 | # abcdefghij (0, 10) 9 | # hijklmnopq (7, 17) 10 | # opqrstuvwx (14, 24) 11 | # uvwxyz (21, 26) 12 | assert len(chunks) == 4 13 | assert chunks[0] == ("abcdefghij", 0, 10) 14 | assert chunks[1] == ("hijklmnopq", 7, 17) 15 | assert chunks[2] == ("opqrstuvwx", 14, 24) 16 | assert chunks[3] == ('vwxyz', 21, 26) 17 | 18 | def test_chunk_no_overlap(): 19 | text = "abcde fghij klmno" 20 | chunks = chunk_text(text, chunk_size=5, chunk_overlap=0) 21 | assert len(chunks) == 4 22 | assert chunks[0] == ("abcde", 0, 5) 23 | assert chunks[1] == (" fghi", 5, 10) # Note space included 24 | assert chunks[2] == ("j klm", 10, 15) 25 | assert chunks[3] == ("no", 15, 17) 26 | 27 | 28 | def test_chunk_large_overlap_error(): 29 | with pytest.raises(ValueError): 30 | chunk_text("abc", chunk_size=5, chunk_overlap=5) 31 | 32 | def test_chunk_smaller_than_size(): 33 | text = "short" 34 | chunks = chunk_text(text, chunk_size=10, chunk_overlap=2) 35 | assert len(chunks) == 1 36 | assert chunks[0] == ("short", 0, 5) 37 | 38 | def test_chunk_exact_size(): 39 | text = "exactsize!" 
# 10 chars 40 | chunks = chunk_text(text, chunk_size=10, chunk_overlap=2) 41 | assert len(chunks) == 1 42 | assert chunks[0] == ("exactsize!", 0, 10) 43 | 44 | def test_chunk_edge_case_overlap(): 45 | # Test where overlap calculation might stall if not handled 46 | text = "1234567890" 47 | chunks = chunk_text(text, chunk_size=5, chunk_overlap=4) 48 | # 12345 (0, 5) 49 | # 23456 (1, 6) 50 | # 34567 (2, 7) 51 | # ... 52 | # 67890 (5, 10) 53 | assert len(chunks) == 6 54 | assert chunks[0] == ("12345", 0, 5) 55 | assert chunks[-1] == ("67890", 5, 10) -------------------------------------------------------------------------------- /safe_store/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | safe_store: Simple SQLite Vector Store for RAG. 3 | 4 | A Python utility library providing a lightweight, efficient, and file-based 5 | vector database using SQLite. Optimized for easy integration into 6 | Retrieval-Augmented Generation (RAG) pipelines for Large Language Models (LLMs). 7 | Includes optional encryption, concurrency control, and graph data capabilities. 8 | """ 9 | 10 | from .store import SafeStore, LogLevel, TEMP_FILE_DB_INDICATOR, IN_MEMORY_DB_INDICATOR, DEFAULT_LOCK_TIMEOUT 11 | from .graph.graph_store import GraphStore 12 | from .core.exceptions import ( # Expose exceptions for users 13 | SafeStoreError, 14 | DatabaseError, 15 | FileHandlingError, 16 | ParsingError, 17 | IndexingError, 18 | VectorizationError, 19 | QueryError, 20 | ConfigurationError, 21 | ConcurrencyError, 22 | EncryptionError, 23 | # Graph specific exceptions 24 | GraphError, 25 | GraphDBError, 26 | GraphProcessingError, 27 | LLMCallbackError, 28 | ) 29 | from .indexing.parser import SAFE_STORE_SUPPORTED_FILE_EXTENSIONS, parse_document 30 | from .processing.text_cleaning import basic_text_cleaner # Expose the basic cleaner as a utility 31 | from ascii_colors import ASCIIColors # Expose for user configuration convenience 32 | 33 | __version__ = "3.3.2" # Version bump to reflect API changes 34 | 35 | __all__ = [ 36 | "SafeStore", 37 | "GraphStore", 38 | "ASCIIColors", 39 | "LogLevel", 40 | # Exceptions 41 | "SafeStoreError", 42 | "DatabaseError", 43 | "FileHandlingError", 44 | "ParsingError", 45 | "IndexingError", 46 | "VectorizationError", 47 | "QueryError", 48 | "ConfigurationError", 49 | "ConcurrencyError", 50 | "EncryptionError", 51 | "GraphError", 52 | "GraphDBError", 53 | "GraphProcessingError", 54 | "LLMCallbackError", 55 | # globals 56 | "SAFE_STORE_SUPPORTED_FILE_EXTENSIONS", 57 | "TEMP_FILE_DB_INDICATOR", 58 | "IN_MEMORY_DB_INDICATOR", 59 | "DEFAULT_LOCK_TIMEOUT", 60 | # utilities 61 | "parse_document", 62 | "basic_text_cleaner" 63 | ] -------------------------------------------------------------------------------- /safe_store/graph/prompts/graph_extraction_prompt.md: -------------------------------------------------------------------------------- 1 | **CRITICAL INSTRUCTION: You are a data extraction expert. Your task is to extract entities (nodes) and relationships from the provided text, strictly adhering to the ontology schema below.** 2 | 3 | - **ONLY** extract nodes whose `label` is explicitly defined in the "NODE LABELS" section of the ontology. 4 | - For each extracted node, **ONLY** include properties that are listed for that specific label in the ontology. Be exhaustive and extract every property defined in the ontology that is present in the text. 
5 | - **ONLY** create relationships where the `type` is explicitly defined in the "RELATIONSHIP TYPES" section. 6 | - You **MUST** respect the `Source` and `Target` constraints for relationships if they are specified. 7 | - If an entity or relationship in the text does not fit the ontology, **DO NOT** extract it. 8 | - Every node's `properties` object **MUST** contain an `identifying_value`. This is a unique name or identifier for the entity (e.g., "John Doe", "Acme Corporation") and is used to link relationships. 9 | - Format the output as a single JSON object inside a markdown code block. 10 | 11 | **User Guidance (Follow these additional instructions within the ontology's constraints):** 12 | {user_guidance} 13 | --- 14 | 15 | **Text to process:** 16 | {chunk_text} 17 | --- 18 | 19 | **JSON Output Structure (Populate this structure according to the rules):** 20 | ```json 21 | {{ 22 | "nodes": [ 23 | {{ 24 | "label": "LabelFromOntology", 25 | "properties": {{ 26 | "identifying_value": "A unique value for this entity (MANDATORY)", 27 | "property_from_ontology": "Value from text", 28 | "...": "..." 29 | }} 30 | }} 31 | ], 32 | "relationships": [ 33 | {{ 34 | "source_node_label": "SourceLabelFromOntology", 35 | "source_node_identifying_value": "Identifier of the source node", 36 | "target_node_label": "TargetLabelFromOntology", 37 | "target_node_identifying_value": "Identifier of the target node", 38 | "type": "RelationshipTypeFromOntology", 39 | "properties": {{ 40 | "role": "A role or description if applicable" 41 | }} 42 | }} 43 | ] 44 | }} 45 | ``` -------------------------------------------------------------------------------- /point_cloud_web_app/data.json: -------------------------------------------------------------------------------- 1 | [{"x": -0.38227444887161255, "y": -0.5533952116966248, "chunk_id": 1, "document_title": "animals.txt", "document_path": "C:\\Users\\aloui\\Documents\\ai\\safe_store\\temp_docs_point_cloud\\animals.txt", "metadata": {"topic": "animals", "source": "fiction"}}, {"x": -0.38085994124412537, "y": -0.5823900103569031, "chunk_id": 2, "document_title": "animals.txt", "document_path": "C:\\Users\\aloui\\Documents\\ai\\safe_store\\temp_docs_point_cloud\\animals.txt", "metadata": {"topic": "animals", "source": "fiction"}}, {"x": -0.3778142035007477, "y": -0.40828418731689453, "chunk_id": 3, "document_title": "animals.txt", "document_path": "C:\\Users\\aloui\\Documents\\ai\\safe_store\\temp_docs_point_cloud\\animals.txt", "metadata": {"topic": "animals", "source": "fiction"}}, {"x": 0.5525932908058167, "y": -0.06589919328689575, "chunk_id": 4, "document_title": "tech.txt", "document_path": "C:\\Users\\aloui\\Documents\\ai\\safe_store\\temp_docs_point_cloud\\tech.txt", "metadata": {"topic": "technology", "source": "documentation"}}, {"x": 0.7413166165351868, "y": -0.11253535002470016, "chunk_id": 5, "document_title": "tech.txt", "document_path": "C:\\Users\\aloui\\Documents\\ai\\safe_store\\temp_docs_point_cloud\\tech.txt", "metadata": {"topic": "technology", "source": "documentation"}}, {"x": 0.6865788698196411, "y": 0.0834011361002922, "chunk_id": 6, "document_title": "tech.txt", "document_path": "C:\\Users\\aloui\\Documents\\ai\\safe_store\\temp_docs_point_cloud\\tech.txt", "metadata": {"topic": "technology", "source": "documentation"}}, {"x": -0.47032028436660767, "y": 0.5269608497619629, "chunk_id": 7, "document_title": "space.txt", "document_path": "C:\\Users\\aloui\\Documents\\ai\\safe_store\\temp_docs_point_cloud\\space.txt", "metadata": {"topic": 
"space", "source": "science"}}, {"x": -0.38039878010749817, "y": 0.6285385489463806, "chunk_id": 8, "document_title": "space.txt", "document_path": "C:\\Users\\aloui\\Documents\\ai\\safe_store\\temp_docs_point_cloud\\space.txt", "metadata": {"topic": "space", "source": "science"}}, {"x": -0.09704269468784332, "y": 0.44058260321617126, "chunk_id": 9, "document_title": "space.txt", "document_path": "C:\\Users\\aloui\\Documents\\ai\\safe_store\\temp_docs_point_cloud\\space.txt", "metadata": {"topic": "space", "source": "science"}}, {"x": 0.10822149366140366, "y": 0.04302079603075981, "chunk_id": 10, "document_title": "space.txt", "document_path": "C:\\Users\\aloui\\Documents\\ai\\safe_store\\temp_docs_point_cloud\\space.txt", "metadata": {"topic": "space", "source": "science"}}] -------------------------------------------------------------------------------- /safe_store/processing/tokenizers.py: -------------------------------------------------------------------------------- 1 | # safe_store/processing/tokenizers.py 2 | from typing import Dict, Any, List 3 | from abc import ABC, abstractmethod 4 | from safe_store.core.exceptions import ConfigurationError 5 | 6 | class TokenizerWrapper(ABC): 7 | """An abstract base class for a standardized tokenizer interface.""" 8 | @abstractmethod 9 | def encode(self, text: str) -> List[int]: 10 | pass 11 | 12 | @abstractmethod 13 | def decode(self, tokens: List[int]) -> str: 14 | pass 15 | 16 | class TikTokenWrapper(TokenizerWrapper): 17 | """A wrapper for tiktoken's Encoding object.""" 18 | def __init__(self, tokenizer: Any): 19 | self.tokenizer = tokenizer 20 | 21 | def encode(self, text: str) -> List[int]: 22 | return self.tokenizer.encode(text) 23 | 24 | def decode(self, tokens: List[int]) -> str: 25 | # tiktoken's decode does not take extra arguments 26 | return self.tokenizer.decode(tokens) 27 | 28 | class HuggingFaceTokenizerWrapper(TokenizerWrapper): 29 | """A wrapper for Hugging Face's tokenizer objects.""" 30 | def __init__(self, tokenizer: Any): 31 | self.tokenizer = tokenizer 32 | 33 | def encode(self, text: str) -> List[int]: 34 | return self.tokenizer.encode(text) 35 | 36 | def decode(self, tokens: List[int]) -> str: 37 | # Hugging Face tokenizers use skip_special_tokens 38 | return self.tokenizer.decode(tokens, skip_special_tokens=True) 39 | 40 | 41 | def get_tokenizer(config: Dict[str, Any]) -> TokenizerWrapper: 42 | """ 43 | Loads and returns a wrapped tokenizer based on the provided configuration. 44 | """ 45 | if not isinstance(config, dict) or "name" not in config: 46 | raise ValueError("Custom tokenizer configuration must be a dictionary with a 'name' key.") 47 | 48 | tokenizer_name = config["name"] 49 | 50 | if tokenizer_name == "tiktoken": 51 | try: 52 | import tiktoken 53 | except ImportError: 54 | raise ConfigurationError("The 'tiktoken' library is required. 
Please run: pip install tiktoken") 55 | 56 | model = config.get("model") 57 | if not model: 58 | raise ValueError("The 'tiktoken' tokenizer requires a 'model' key (e.g., 'cl100k_base').") 59 | 60 | try: 61 | tokenizer_instance = tiktoken.get_encoding(model) 62 | return TikTokenWrapper(tokenizer_instance) 63 | except Exception as e: 64 | raise ConfigurationError(f"Failed to load tiktoken encoding '{model}': {e}") from e 65 | 66 | else: 67 | raise ValueError(f"Unknown custom tokenizer name: '{tokenizer_name}'") -------------------------------------------------------------------------------- /safe_store/graph/prompts/graph_extraction_prompt_with_ontology.md: -------------------------------------------------------------------------------- 1 | **CRITICAL INSTRUCTION: You are a data extraction expert. Your task is to extract entities (nodes) and relationships from the provided text, strictly adhering to the ontology schema below.** 2 | 3 | - **ONLY** extract nodes whose `label` is explicitly defined in the "NODE LABELS" section of the ontology. **If an entity in the text does not match a label in the ontology, you MUST ignore it and not include it in the output.** 4 | - For each extracted node, **ONLY** include properties that are listed for that specific label in the ontology. Be exhaustive and extract every property defined in the ontology that is present in the text. 5 | - **ONLY** create relationships where the `type` is explicitly defined in the "RELATIONSHIP TYPES" section. 6 | - You **MUST** respect the `Source` and `Target` constraints for relationships if they are specified. 7 | - If an entity or relationship in the text does not fit the ontology, **DO NOT** extract it. 8 | - Every node's `properties` object **MUST** contain an `identifying_value`. This is a unique name or identifier for the entity (e.g., "John Doe", "Acme Corporation") and is used to link relationships. 9 | - Format the output as a single JSON object inside a markdown code block. 10 | 11 | **User Guidance (Follow these additional instructions within the ontology's constraints):** 12 | {user_guidance} 13 | --- 14 | 15 | **Text to process:** 16 | {chunk_text} 17 | --- 18 | 19 | **JSON Output Structure (Populate this structure according to the rules):** 20 | ```json 21 | {{ 22 | "nodes": [ 23 | {{ 24 | "label": "LabelFromOntology", 25 | "properties": {{ 26 | "identifying_value": "A unique value for this entity (MANDATORY)", 27 | "property_from_ontology": "Value from text", 28 | "...": "..." 29 | }} 30 | }} 31 | ], 32 | "relationships": [ 33 | {{ 34 | "source_node_label": "SourceLabelFromOntology", 35 | "source_node_identifying_value": "Identifier of the source node", 36 | "target_node_label": "TargetLabelFromOntology", 37 | "target_node_identifying_value": "Identifier of the target node", 38 | "type": "RelationshipTypeFromOntology", 39 | "properties": {{ 40 | "role": "A role or description if applicable" 41 | }} 42 | }} 43 | ] 44 | }} 45 | ``` 46 | **WARNING:** 47 | - For nodes, `label` and `properties.identifying_value` are mandatory. 48 | - For relationships, `source_node_label`, `source_node_identifying_value`, `target_node_label`, `target_node_identifying_value`, and `type` are mandatory. 
-------------------------------------------------------------------------------- /safe_store/core/exceptions.py: -------------------------------------------------------------------------------- 1 | # safe_store/core/exceptions.py 2 | 3 | class SafeStoreError(Exception): 4 | """Base class for all safe_store specific errors.""" 5 | pass 6 | 7 | class DatabaseError(SafeStoreError): 8 | """Errors related to database operations (connection, schema, query, transaction).""" 9 | pass 10 | 11 | class FileHandlingError(SafeStoreError): 12 | """Errors related to file system operations (reading, writing, hashing, not found).""" 13 | pass 14 | 15 | class ParsingError(FileHandlingError): 16 | """Errors occurring during document parsing (subclass of FileHandlingError).""" 17 | pass 18 | 19 | class ConfigurationError(SafeStoreError): 20 | """Errors related to invalid configuration, setup, or missing optional dependencies.""" 21 | pass 22 | 23 | class IndexingError(SafeStoreError): 24 | """Errors specifically within the document indexing pipeline (chunking, storage logic).""" 25 | # Note: ParsingError, VectorizationError cover sub-steps. This is for orchestration. 26 | pass 27 | 28 | class VectorizationError(SafeStoreError): 29 | """Errors related to vectorization processes (model loading, encoding, fitting).""" 30 | pass 31 | 32 | class QueryError(SafeStoreError): 33 | """Errors occurring during query execution (similarity calculation, result fetching).""" 34 | pass 35 | 36 | class ConcurrencyError(SafeStoreError): 37 | """Errors related to file locking or concurrent access issues (e.g., timeouts).""" 38 | pass 39 | 40 | class EncryptionError(SafeStoreError): 41 | """Errors related to data encryption or decryption.""" 42 | pass 43 | 44 | # --- New Graph-related Exceptions --- 45 | class GraphError(SafeStoreError): 46 | """Base class for graph-specific errors.""" 47 | pass 48 | 49 | class GraphDBError(DatabaseError, GraphError): # Inherits from DatabaseError and GraphError 50 | """Errors related to graph database operations.""" 51 | pass 52 | 53 | class GraphProcessingError(GraphError): 54 | """Errors occurring during the processing of text to extract graph elements.""" 55 | pass 56 | 57 | class LLMCallbackError(GraphProcessingError): 58 | """Errors related to the LLM processing callback function.""" 59 | pass 60 | 61 | class NodeNotFoundError(GraphError): 62 | """Errors occurring during the processing of text to extract graph elements.""" 63 | pass 64 | class RelationshipNotFoundError(GraphError): 65 | """Errors occurring during the processing of text to extract graph elements.""" 66 | pass 67 | class DocumentNotFoundError(GraphError): 68 | """Errors occurring during the processing of text or file.""" 69 | pass 70 | 71 | class GraphEntityFusionError(GraphProcessingError): 72 | """Errors related to the entity fusion process, including LLM decisions.""" 73 | pass -------------------------------------------------------------------------------- /examples/basic_usage_text.py: -------------------------------------------------------------------------------- 1 | from safe_store import SafeStore 2 | from pathlib import Path 3 | 4 | # --- Cleanup --- 5 | # Ensure the database from previous runs is removed for a clean start 6 | db_file = Path("basic_usage_store.db") 7 | db_file.unlink(missing_ok=True) 8 | Path(f"{db_file}.lock").unlink(missing_ok=True) 9 | 10 | 11 | # --- 1. 
Initialize the store with a fixed configuration --- 12 | # All indexing parameters (vectorizer, chunking, cleaning) are now defined 13 | # when the SafeStore instance is created. 14 | print("--- Initializing SafeStore with a fixed configuration ---") 15 | ss = SafeStore( 16 | db_path=db_file, 17 | name="my_database", 18 | description="A cool database demonstrating fixed configuration", 19 | 20 | # Vectorizer Configuration 21 | vectorizer_name="st", 22 | vectorizer_config={"model": "all-MiniLM-L6-v2"}, 23 | 24 | # Chunking and Processing Configuration 25 | chunk_size=10, # Small chunk size for demonstration (in tokens) 26 | chunk_overlap=2, # Small overlap (in tokens) 27 | chunking_strategy='token', # Use the model's tokenizer for chunking 28 | expand_before=5, # Add 5 tokens of context before the vectorized chunk 29 | expand_after=5, # Add 5 tokens of context after the vectorized chunk 30 | text_cleaner='basic' # Use the built-in basic text cleaner 31 | ) 32 | 33 | # --- 2. Add content --- 34 | # The add_text method is now much simpler. It uses the configuration 35 | # provided when the store was created. 36 | print("\n--- Adding content to the store ---") 37 | text_to_add = "The quick brown fox jumps over the lazy dog. This sentence is used to demonstrate all letters of the alphabet. It is a classic pangram." 38 | ss.add_text( 39 | unique_id="pangram_text", 40 | text=text_to_add 41 | ) 42 | print(f"Added text with ID 'pangram_text'.") 43 | 44 | 45 | # --- 3. Query the store --- 46 | # The query method also uses the instance's configured vectorizer automatically. 47 | print("\n--- Querying the store ---") 48 | query = "a speedy fox" 49 | results = ss.query(query) 50 | 51 | print(f"Query: '{query}'") 52 | for r in results: 53 | print("-" * 20) 54 | print(f"Similarity: {r['similarity_percent']:.2f}%") 55 | # The 'chunk_text' returned is the EXPANDED text for better context. 56 | print(f"Stored (expanded) chunk: '{r['chunk_text']}'") 57 | 58 | # --- 4. Vectorize text directly (optional) --- 59 | # This method uses the instance's configured vectorizer. 60 | print("\n--- Vectorizing a new sentence directly ---") 61 | v1 = ss.vectorize_text("Hello there") 62 | print(f"Successfully vectorized a new sentence. Vector dimension: {v1.shape}") 63 | 64 | # The store is automatically closed if used in a 'with' block, 65 | # or you can call ss.close() manually. 66 | ss.close() 67 | print("\n--- Example finished ---") -------------------------------------------------------------------------------- /safe_store/processing/text_cleaning.py: -------------------------------------------------------------------------------- 1 | # safe_store/processing/text_cleaning.py 2 | import re 3 | from typing import Callable, Union 4 | 5 | def basic_text_cleaner(text: str) -> str: 6 | """ 7 | An enhanced text cleaner that performs several common cleanup tasks, designed 8 | to be safe for code and structured text while improving quality for LLMs. 9 | 10 | - Normalizes all line endings to a single newline character (`\n`). 11 | - Removes non-printable ASCII control characters (except tab and newline) that 12 | can break LLM tokenizers. 13 | - Preserves leading whitespace (indentation) on each line, which is crucial for code. 14 | - Replaces repetitive dot sequences (e.g., '....') with a standard ellipsis ('...'). 15 | - Collapses multiple spaces *within* a line into a single space, but leaves indentation untouched. 
16 | - Reduces three or more consecutive newlines down to just two, preserving paragraph 17 | breaks without creating excessive empty space. Single newlines are kept. 18 | 19 | Args: 20 | text: The input string to clean. 21 | 22 | Returns: 23 | The cleaned string. 24 | """ 25 | if not isinstance(text, str): 26 | return "" 27 | 28 | # 1. Normalize line endings to \n. 29 | text = text.replace('\r\n', '\n').replace('\r', '\n') 30 | 31 | # 2. Remove non-printable control characters except for tab, newline. 32 | text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text) 33 | 34 | # 3. Replace long sequences of dots with a standard ellipsis. 35 | text = re.sub(r'\.{4,}', '...', text) 36 | 37 | # 4. Process line by line to preserve indentation while cleaning inline spaces. 38 | lines = text.split('\n') 39 | cleaned_lines = [] 40 | for line in lines: 41 | # Separate leading whitespace (indentation) from the rest of the content 42 | match = re.match(r'^(\s*)', line) 43 | leading_whitespace = match.group(1) if match else "" 44 | content = line[len(leading_whitespace):] 45 | 46 | # Collapse multiple spaces in the content part only 47 | cleaned_content = re.sub(r' {2,}', ' ', content) 48 | 49 | cleaned_lines.append(leading_whitespace + cleaned_content) 50 | 51 | text = '\n'.join(cleaned_lines) 52 | 53 | # 5. Reduce 3 or more newlines to a maximum of two. 54 | text = re.sub(r'\n{3,}', '\n\n', text) 55 | 56 | return text.strip() 57 | 58 | 59 | def get_cleaner(cleaner: Union[str, Callable[[str], str], None]) -> Callable[[str], str]: 60 | """ 61 | Returns a callable cleaner function. 62 | 63 | Args: 64 | cleaner: Can be the name of a predefined cleaner ('basic') or a custom 65 | callable function. If None, returns an identity function that 66 | does nothing. 67 | 68 | Returns: 69 | A callable function that takes a string and returns a string. 70 | """ 71 | if cleaner is None: 72 | return lambda x: x # Identity function 73 | if callable(cleaner): 74 | return cleaner 75 | if isinstance(cleaner, str): 76 | if cleaner == 'basic': 77 | return basic_text_cleaner 78 | else: 79 | raise ValueError(f"Unknown predefined cleaner: '{cleaner}'") 80 | raise TypeError("cleaner must be a string, a callable, or None") -------------------------------------------------------------------------------- /examples/encryption_usage.py: -------------------------------------------------------------------------------- 1 | # examples/encryption_usage.py 2 | """ 3 | Demonstrates using safe_store's encryption feature. 4 | """ 5 | import safe_store 6 | from pathlib import Path 7 | import shutil 8 | import sqlite3 9 | 10 | # --- Configuration --- 11 | DB_FILE = "encrypted_example_store.db" 12 | ENCRYPTION_KEY = "this-is-my-secret-Pa$$wOrd!" 13 | DOC_DIR = Path("temp_docs_encrypted") 14 | VECTORIZER_NAME = "st" 15 | VECTORIZER_CONFIG = {"model": "all-MiniLM-L6-v2"} 16 | 17 | def print_header(title): 18 | print("\n" + "="*10 + f" {title} " + "="*10) 19 | 20 | def cleanup(): 21 | print_header("Cleaning Up") 22 | for p in [DB_FILE, f"{DB_FILE}.lock", f"{DB_FILE}-wal", f"{DB_FILE}-shm"]: 23 | Path(p).unlink(missing_ok=True) 24 | if DOC_DIR.exists(): shutil.rmtree(DOC_DIR) 25 | print("- Cleanup complete.") 26 | 27 | if __name__ == "__main__": 28 | cleanup() 29 | 30 | DOC_DIR.mkdir(exist_ok=True) 31 | doc_path = DOC_DIR / "secret_notes.txt" 32 | doc_path.write_text("Project Phoenix: Launch date is Q4. Key personnel: Alice, Bob.") 33 | 34 | # --- 1. 
Initialize SafeStore WITH Encryption Key and Vectorizer --- 35 | print_header("Initializing Encrypted Store") 36 | store_encrypted = safe_store.SafeStore( 37 | DB_FILE, 38 | vectorizer_name=VECTORIZER_NAME, 39 | vectorizer_config=VECTORIZER_CONFIG, 40 | log_level=safe_store.LogLevel.INFO, 41 | encryption_key=ENCRYPTION_KEY 42 | ) 43 | 44 | # --- 2. Add Document to Encrypted Store --- 45 | print_header("Adding Document (Encrypted)") 46 | with store_encrypted: 47 | store_encrypted.add_document(doc_path, metadata={"sensitivity": "high"}) 48 | print(f"- Added '{doc_path.name}'.") 49 | 50 | # Direct DB check 51 | conn = sqlite3.connect(store_encrypted.db_path) 52 | is_encrypted_flag = conn.execute("SELECT is_encrypted FROM chunks LIMIT 1").fetchone()[0] 53 | conn.close() 54 | if is_encrypted_flag == 1: 55 | print("[VERIFIED] Direct DB check: is_encrypted flag is set.") 56 | else: 57 | print("[WARNING] Direct DB check: is_encrypted flag is NOT set.") 58 | 59 | # --- 3. Query Encrypted Store (With Key) --- 60 | print_header("Querying Encrypted Store (With Key)") 61 | query = "project personnel" 62 | results = store_encrypted.query(query, top_k=1) 63 | if results: 64 | print(f" Text: '{results[0]['chunk_text']}'") 65 | assert "[Encrypted" not in results[0]['chunk_text'] 66 | 67 | # --- 4. Access Encrypted DB WITHOUT the Key --- 68 | print_header("Accessing Encrypted Store WITHOUT Key") 69 | store_no_key = safe_store.SafeStore(DB_FILE, vectorizer_name=VECTORIZER_NAME, vectorizer_config=VECTORIZER_CONFIG) 70 | with store_no_key: 71 | results_no_key = store_no_key.query("security protocol", top_k=1) 72 | if results_no_key: 73 | print(f" Text: '{results_no_key[0]['chunk_text']}'") 74 | assert results_no_key[0]['chunk_text'] == "[Encrypted - Key Unavailable]" 75 | 76 | # --- 5. 
Access Encrypted DB with WRONG Key --- 77 | print_header("Accessing Encrypted Store With WRONG Key") 78 | store_wrong_key = safe_store.SafeStore(DB_FILE, vectorizer_name=VECTORIZER_NAME, vectorizer_config=VECTORIZER_CONFIG, encryption_key="wrong-key") 79 | with store_wrong_key: 80 | results_wrong_key = store_wrong_key.query("launch date", top_k=1) 81 | if results_wrong_key: 82 | print(f" Text: '{results_wrong_key[0]['chunk_text']}'") 83 | assert results_wrong_key[0]['chunk_text'] == "[Encrypted - Decryption Failed]" -------------------------------------------------------------------------------- /safe_store/utils/json_parsing.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | from ascii_colors import ASCIIColors, trace_exception 4 | 5 | import re 6 | import json 7 | from collections import OrderedDict 8 | 9 | def robust_json_parser(json_string: str) -> dict: 10 | json_string = re.sub(r"^```(?:json)?\s*|\s*```$", '', json_string.strip()) 11 | 12 | try: 13 | return json.loads(json_string) 14 | except json.JSONDecodeError as ex: 15 | err = ex 16 | 17 | json_match = re.search(r'(\{[\s\S]*\}|\[[\s\S]*\])', json_string) 18 | cleaned_string = json_match.group(0) if json_match else json_string 19 | 20 | try: 21 | cleaned_string = re.sub(r'\bTrue\b', 'true', cleaned_string) 22 | cleaned_string = re.sub(r'\bFalse\b', 'false', cleaned_string) 23 | cleaned_string = re.sub(r'\bNone\b', 'null', cleaned_string) 24 | cleaned_string = re.sub(r'\b(undefined|NaN|Infinity|-Infinity)\b', 'null', cleaned_string) 25 | 26 | cleaned_string = re.sub(r'//.*', '', cleaned_string) 27 | cleaned_string = re.sub(r'/\*[\s\S]*?\*/', '', cleaned_string) 28 | 29 | cleaned_string = re.sub(r'\\([_`*#\-])', r'\1', cleaned_string) 30 | cleaned_string = re.sub(r',\s*(\}|\])', r'\1', cleaned_string) 31 | 32 | cleaned_string = re.sub(r'\}\s*\{', '},{', cleaned_string) 33 | 34 | def escape_newlines_in_strings(text: str) -> str: 35 | in_string = False 36 | result = [] 37 | i = 0 38 | while i < len(text): 39 | c = text[i] 40 | if c == '"' and (i == 0 or text[i - 1] != '\\'): 41 | in_string = not in_string 42 | if in_string and c == '\n': 43 | result.append('\\n') 44 | else: 45 | result.append(c) 46 | i += 1 47 | return ''.join(result) 48 | 49 | cleaned_string = escape_newlines_in_strings(cleaned_string) 50 | 51 | def escape_unescaped_inner_quotes(text: str) -> str: 52 | def fix(match): 53 | s = match.group(0) 54 | inner = s[1:-1] 55 | inner_fixed = re.sub(r'(? str: 80 | stack = [] 81 | for c in s: 82 | if c in "{[": 83 | stack.append(c) 84 | elif c in "}]": 85 | if stack and ((stack[-1] == '{' and c == '}') or (stack[-1] == '[' and c == ']')): 86 | stack.pop() 87 | for opener in reversed(stack): 88 | s += '}' if opener == '{' else ']' 89 | return s 90 | 91 | cleaned_string = balance_brackets(cleaned_string) 92 | 93 | return json.loads(cleaned_string, object_pairs_hook=OrderedDict) 94 | 95 | except json.JSONDecodeError as e: 96 | raise ValueError(f"Failed to parse JSON. Final error: {e}") from e 97 | -------------------------------------------------------------------------------- /docs/logging.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Logging 3 | ======= 4 | 5 | ``safe_store`` uses the `ascii_colors `_ library for internal logging, providing clear, leveled, and colorful console output by default. 6 | 7 | Default Behavior 8 | ---------------- 9 | 10 | * Logs are printed directly to the console (stderr). 
11 | * Messages are color-coded based on severity (DEBUG, INFO, SUCCESS, WARNING, ERROR, CRITICAL). 12 | * The default logging level is ``INFO``. This means only messages with severity INFO, SUCCESS, WARNING, ERROR, and CRITICAL will be displayed. DEBUG messages are hidden. 13 | 14 | Changing the Log Level 15 | ---------------------- 16 | 17 | You can easily change the minimum severity level displayed when initializing ``safe_store``: 18 | 19 | .. code-block:: python 20 | 21 | import safe_store 22 | from safe_store import LogLevel # Or from ascii_colors import LogLevel 23 | 24 | # Show only warnings and errors 25 | store_warn = safe_store.SafeStore("my_store_warn.db", log_level=LogLevel.WARNING) 26 | 27 | # Show all messages, including detailed debug info 28 | store_debug = safe_store.SafeStore("my_store_debug.db", log_level=LogLevel.DEBUG) 29 | 30 | Advanced Configuration (Global) 31 | ------------------------------- 32 | 33 | Since ``safe_store`` uses ``ascii_colors``, you can configure logging globally for your entire application *before* initializing ``safe_store``. This allows you to: 34 | 35 | * Log messages to a file. 36 | * Change the output format. 37 | * Use JSON formatting. 38 | * Add multiple handlers (e.g., log DEBUG to file, INFO to console). 39 | * Disable console logging entirely. 40 | 41 | Here's how to configure ``ascii_colors`` globally: 42 | 43 | .. code-block:: python 44 | 45 | import safe_store 46 | from ascii_colors import ASCIIColors, LogLevel, FileHandler, Formatter, JSONFormatter 47 | import logging # Standard logging Formatter can also be used 48 | 49 | # --- Example 1: Set global level and log to file --- 50 | ASCIIColors.set_log_level(LogLevel.DEBUG) # Apply DEBUG level globally 51 | 52 | # Create a file handler 53 | log_file = "app_activity.log" 54 | file_handler = FileHandler(log_file, encoding='utf-8') 55 | 56 | # Set a specific format for the file 57 | file_formatter = Formatter( 58 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s", 59 | datefmt="%Y-%m-%d %H:%M:%S" 60 | ) 61 | file_handler.setFormatter(file_formatter) 62 | 63 | # Add the file handler to ascii_colors 64 | ASCIIColors.add_handler(file_handler) 65 | 66 | # Optional: If you ONLY want file logging, remove the default console handler 67 | # default_console_handler = ASCIIColors.get_default_handler() 68 | # if default_console_handler: 69 | # ASCIIColors.remove_handler(default_console_handler) 70 | 71 | print(f"Logging DEBUG and above to console (default) and {log_file}") 72 | 73 | # Now initialize safe_store - it will respect the global settings 74 | store = safe_store.SafeStore("my_store.db") 75 | # ... use store ... 76 | # safe_store's internal DEBUG messages will now appear in the file 77 | 78 | 79 | # --- Example 2: JSON logging to file --- 80 | # Clear previous handlers if starting fresh configuration 81 | # ASCIIColors.reset() # Or ASCIIColors.clear_handlers() 82 | 83 | # ASCIIColors.set_log_level(LogLevel.INFO) # Set desired level 84 | 85 | # json_handler = FileHandler("app_log.jsonl", encoding='utf-8') 86 | # json_formatter = JSONFormatter() 87 | # json_handler.setFormatter(json_formatter) 88 | # ASCIIColors.add_handler(json_handler) 89 | 90 | # # Optionally remove console handler 91 | # # default_console_handler = ASCIIColors.get_default_handler() 92 | # # if default_console_handler: ASCIIColors.remove_handler(default_console_handler) 93 | 94 | # store_json = safe_store.SafeStore("my_json_store.db") 95 | # ... use store_json ... 
96 | 97 | See the `ascii_colors documentation `_ for more details on handlers and formatters. 98 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "safe-store" 7 | # Bump version for this release 8 | version = "3.3.3" 9 | description = "Simple, concurrent SQLite-based vector store optimized for local RAG pipelines, with optional encryption." 10 | readme = "README.md" 11 | requires-python = ">=3.8" 12 | license = { file = "LICENSE" } 13 | authors = [ 14 | { name = "ParisNeo", email = "parisneo_ai@gmail.com" }, 15 | ] 16 | keywords = ["vector", "database", "sqlite", "rag", "llm", "embedding", "semantic search", "local", "concurrent", "encryption", "webui"] 17 | classifiers = [ 18 | "Development Status :: 5 - Production/Stable", # Or "5 - Production/Stable" if ready 19 | "Programming Language :: Python :: 3", 20 | "Programming Language :: Python :: 3.8", 21 | "Programming Language :: Python :: 3.9", 22 | "Programming Language :: Python :: 3.10", 23 | "Programming Language :: Python :: 3.11", 24 | "Programming Language :: Python :: 3.12", 25 | "License :: OSI Approved :: Apache Software License", 26 | "Operating System :: OS Independent", 27 | "Intended Audience :: Developers", 28 | "Intended Audience :: Science/Research", 29 | "Topic :: Database", 30 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 31 | "Topic :: Software Development :: Libraries :: Python Modules", 32 | "Topic :: Text Processing :: Indexing", 33 | "Topic :: Security :: Cryptography", # Added classifier 34 | ] 35 | # Core dependencies 36 | dependencies = [ 37 | "pipmaster>=1.0.8", 38 | "numpy>=1.21", 39 | "ascii_colors>=0.11.6", # For logging/console output 40 | "filelock>=3.9", # For concurrency control 41 | "sqlalchemy" 42 | ] 43 | 44 | [project.optional-dependencies] 45 | # --- Parsing Libraries --- 46 | parsing = [ 47 | "pypdf>=3.10", 48 | "python-docx>=1.0", 49 | "beautifulsoup4>=4.11", 50 | "lxml>=4.9", # Recommended HTML parser 51 | ] 52 | 53 | # --- Vectorizer Libraries --- 54 | sentence-transformers = ["sentence-transformers==4.1.0"] 55 | tfidf = ["scikit-learn>=1.0"] 56 | # Future vectorizers can be added here: 57 | openai = ["openai>=1.0"] 58 | ollama = ["ollama"] 59 | 60 | # --- Encryption Library --- 61 | encryption = ["cryptography>=40.0"] # Added Encryption 62 | 63 | # --- Combined Extras --- 64 | all-vectorizers = [ 65 | "safe-store[sentence-transformers]", 66 | "safe-store[tfidf]", 67 | "safe-store[ollama]", 68 | "safe-store[openai]", 69 | ] 70 | # Combined extra for everything (parsing, all vectorizers, encryption, webui) 71 | all = [ 72 | "safe-store[parsing]", 73 | "safe-store[all-vectorizers]", 74 | "safe-store[encryption]", 75 | ] 76 | # Extra for development dependencies (testing, linting, building, docs) 77 | dev = [ 78 | "safe-store[all]", # Dev should include all optional features 79 | "pytest>=7.0", 80 | "pytest-cov>=3.0", 81 | "flake8>=5.0", # Or ruff 82 | "black>=22.0", 83 | "mypy>=0.9", 84 | "types-filelock", 85 | "types-cryptography", # Added types for cryptography 86 | # Documentation tools 87 | "Sphinx>=5.0", 88 | "sphinx-rtd-theme>=1.0", 89 | "lxml", # Needed again for docutils used by sphinx if not already installed 90 | # Build tools 91 | "hatchling", 92 | "wheel", 93 | "twine", # For checking/uploading packages 94 | ] 95 | 96 | 
[project.urls] 97 | Homepage = "https://github.com/ParisNeo/safe_store" 98 | Repository = "https://github.com/ParisNeo/safe_store" 99 | Documentation = "https://github.com/ParisNeo/safe_store#readme" # Link to README initially, update later if dedicated docs site exists 100 | Issues = "https://github.com/ParisNeo/safe_store/issues" 101 | 102 | # --- Tool Configurations --- 103 | [tool.black] 104 | line-length = 88 105 | target-version = ['py38'] 106 | 107 | [tool.hatch.version] 108 | path = "safe_store/__init__.py" 109 | 110 | [tool.hatch.build] 111 | include = [ 112 | "safe_store" 113 | ] 114 | -------------------------------------------------------------------------------- /tests/security/test_encryption.py: -------------------------------------------------------------------------------- 1 | # tests/security/test_encryption.py 2 | import pytest 3 | from safe_store.security.encryption import Encryptor, CRYPTOGRAPHY_AVAILABLE 4 | from safe_store.core.exceptions import EncryptionError, ConfigurationError 5 | 6 | # Conditionally skip tests if cryptography is not installed 7 | pytestmark = pytest.mark.skipif(not CRYPTOGRAPHY_AVAILABLE, reason="Requires cryptography library") 8 | 9 | @pytest.fixture 10 | def password() -> str: 11 | return "test-password-123!" 12 | 13 | @pytest.fixture 14 | def encryptor_instance(password: str) -> Encryptor: 15 | return Encryptor(password) 16 | 17 | def test_encryptor_init_no_password(): 18 | encryptor = Encryptor(None) 19 | assert not encryptor.is_enabled 20 | with pytest.raises(EncryptionError, match="Encryption is not enabled"): 21 | encryptor.encrypt("test") 22 | with pytest.raises(EncryptionError, match="Decryption is not enabled"): 23 | encryptor.decrypt(b"somebytes") 24 | 25 | def test_encryptor_init_empty_password(): 26 | with pytest.raises(ValueError, match="non-empty string"): 27 | Encryptor("") 28 | 29 | def test_encryptor_init_with_password(encryptor_instance: Encryptor): 30 | assert encryptor_instance.is_enabled 31 | assert encryptor_instance._fernet is not None 32 | 33 | def test_derive_key_consistency(password: str): 34 | """Ensure the same password yields the same key (due to fixed salt).""" 35 | key1 = Encryptor._derive_key(password) 36 | key2 = Encryptor._derive_key(password) 37 | assert key1 == key2 38 | assert isinstance(key1, bytes) 39 | 40 | def test_derive_key_different_passwords(password: str): 41 | key1 = Encryptor._derive_key(password) 42 | key2 = Encryptor._derive_key(password + "extra") 43 | assert key1 != key2 44 | 45 | def test_encrypt_decrypt_success(encryptor_instance: Encryptor): 46 | original_data = "This is sensitive data." 
47 | encrypted_token = encryptor_instance.encrypt(original_data) 48 | assert isinstance(encrypted_token, bytes) 49 | assert encrypted_token != original_data.encode('utf-8') 50 | 51 | decrypted_data = encryptor_instance.decrypt(encrypted_token) 52 | assert isinstance(decrypted_data, str) 53 | assert decrypted_data == original_data 54 | 55 | def test_encrypt_non_string(encryptor_instance: Encryptor): 56 | with pytest.raises(TypeError, match="must be a string"): 57 | encryptor_instance.encrypt(b"bytes data") # type: ignore 58 | with pytest.raises(TypeError, match="must be a string"): 59 | encryptor_instance.encrypt(123) # type: ignore 60 | 61 | def test_decrypt_non_bytes(encryptor_instance: Encryptor): 62 | with pytest.raises(TypeError, match="must be bytes"): 63 | encryptor_instance.decrypt("string data") # type: ignore 64 | with pytest.raises(TypeError, match="must be bytes"): 65 | encryptor_instance.decrypt(123) # type: ignore 66 | 67 | def test_decrypt_invalid_token(encryptor_instance: Encryptor): 68 | invalid_token = b"not_a_valid_fernet_token" 69 | with pytest.raises(EncryptionError, match="Invalid token"): 70 | encryptor_instance.decrypt(invalid_token) 71 | 72 | def test_decrypt_tampered_token(encryptor_instance: Encryptor): 73 | original_data = "Original message." 74 | encrypted_token = encryptor_instance.encrypt(original_data) 75 | # Tamper slightly (e.g., flip a bit - simplistic tamper) 76 | tampered_token = bytearray(encrypted_token) 77 | tampered_token[-1] = tampered_token[-1] ^ 1 # Flip last bit 78 | tampered_token_bytes = bytes(tampered_token) 79 | 80 | with pytest.raises(EncryptionError, match="Invalid token"): 81 | encryptor_instance.decrypt(tampered_token_bytes) 82 | 83 | def test_decrypt_wrong_key(password: str): 84 | encryptor1 = Encryptor(password) 85 | encryptor2 = Encryptor(password + "_different") 86 | 87 | original_data = "Secret info." 88 | encrypted_token = encryptor1.encrypt(original_data) 89 | 90 | # Attempt decryption with the wrong key 91 | with pytest.raises(EncryptionError, match="Invalid token"): 92 | encryptor2.decrypt(encrypted_token) 93 | -------------------------------------------------------------------------------- /docs/quickstart.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Quick Start 3 | ========== 4 | 5 | Here's a basic example demonstrating indexing and querying: 6 | 7 | .. code-block:: python 8 | 9 | import safe_store 10 | from pathlib import Path 11 | import time # For demonstrating concurrency 12 | 13 | # --- 1. Prepare Sample Documents --- 14 | doc_dir = Path("my_docs") 15 | doc_dir.mkdir(exist_ok=True) 16 | doc1_path = doc_dir / "doc1.txt" 17 | doc1_path.write_text("safe_store makes local vector storage simple and efficient.", encoding='utf-8') 18 | doc2_path = doc_dir / "doc2.html" 19 | doc2_path.write_text("
<html><body><p>HTML content can also be indexed.</p></body></html>
", encoding='utf-8') 20 | 21 | print(f"Created sample files in: {doc_dir.resolve()}") 22 | 23 | # --- 2. Initialize safe_store --- 24 | # Use DEBUG level for more verbose output, adjust lock timeout if needed 25 | # Add encryption_key="your-secret-password" to enable encryption 26 | store = safe_store.SafeStore( 27 | "my_vector_store.db", 28 | log_level=safe_store.LogLevel.DEBUG, 29 | lock_timeout=10 # Wait up to 10s for write lock 30 | # encryption_key="your-secret-password" # Uncomment to enable 31 | ) 32 | 33 | # Best practice: Use safe_store as a context manager 34 | try: 35 | with store: 36 | # --- 3. Add Documents (acquires write lock) --- 37 | print("\n--- Indexing Documents ---") 38 | # Requires safe_store[sentence-transformers] 39 | store.add_document(doc1_path, vectorizer_name="st:all-MiniLM-L6-v2", chunk_size=50, chunk_overlap=10) 40 | 41 | # Requires safe_store[parsing] for HTML 42 | store.add_document(doc2_path, vectorizer_name="st:all-MiniLM-L6-v2") 43 | 44 | # Add TF-IDF vectors as well (requires safe_store[tfidf]) 45 | # This will fit TF-IDF on all documents 46 | print("\n--- Adding TF-IDF Vectorization ---") 47 | store.add_vectorization("tfidf:my_analysis") 48 | 49 | # --- 4. Query (read operation, concurrent with WAL) --- 50 | print("\n--- Querying using Sentence Transformer ---") 51 | query_st = "simple storage" 52 | results_st = store.query(query_st, vectorizer_name="st:all-MiniLM-L6-v2", top_k=2) 53 | for i, res in enumerate(results_st): 54 | print(f"ST Result {i+1}: Score={res['similarity']:.4f}, Path='{Path(res['file_path']).name}', Text='{res['chunk_text'][:60]}...'") 55 | 56 | print("\n--- Querying using TF-IDF ---") 57 | query_tfidf = "html index" 58 | results_tfidf = store.query(query_tfidf, vectorizer_name="tfidf:my_analysis", top_k=2) 59 | for i, res in enumerate(results_tfidf): 60 | print(f"TFIDF Result {i+1}: Score={res['similarity']:.4f}, Path='{Path(res['file_path']).name}', Text='{res['chunk_text'][:60]}...'") 61 | 62 | # --- 5. 
List Methods --- 63 | print("\n--- Listing Vectorization Methods ---") 64 | methods = store.list_vectorization_methods() 65 | for method in methods: 66 | print(f"- ID: {method['method_id']}, Name: {method['method_name']}, Type: {method['method_type']}, Dim: {method['vector_dim']}") 67 | 68 | except safe_store.ConfigurationError as e: 69 | print(f"\n[ERROR] Missing dependency: {e}") 70 | print("Please install the required extras (e.g., pip install safe_store[all])") 71 | except safe_store.ConcurrencyError as e: 72 | print(f"\n[ERROR] Lock timeout or concurrency issue: {e}") 73 | except Exception as e: 74 | print(f"\n[ERROR] An unexpected error occurred: {e}") 75 | finally: 76 | # Connection is closed automatically by the 'with' statement exit 77 | print("\n--- Store context closed ---") 78 | # Cleanup (optional) 79 | # import shutil 80 | # shutil.rmtree(doc_dir) 81 | # Path("my_vector_store.db").unlink(missing_ok=True) 82 | # Path("my_vector_store.db.lock").unlink(missing_ok=True) 83 | 84 | print("\nCheck 'my_vector_store.db' and console logs.") 85 | 86 | -------------------------------------------------------------------------------- /safe_store/vectorization/methods/sentense_transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # safe_store/vectorization/methods/st.py 2 | import numpy as np 3 | from typing import List, Optional, Dict, Any 4 | from safe_store.vectorization.base import BaseVectorizer 5 | from safe_store.core.exceptions import ConfigurationError, VectorizationError 6 | from safe_store.processing.tokenizers import HuggingFaceTokenizerWrapper 7 | from ascii_colors import ASCIIColors, trace_exception 8 | import pipmaster as pm 9 | 10 | class_name="STVectorizer" 11 | 12 | try: 13 | pm.ensure_packages(["torch","torchvision","sentence-transformers"]) 14 | from sentence_transformers import SentenceTransformer 15 | except Exception as e: 16 | trace_exception(e) 17 | SentenceTransformer = None 18 | 19 | 20 | def list_available_models(**kwargs) -> List[str]: 21 | """ 22 | Returns a curated list of popular and effective Sentence Transformer models. 23 | This list is static as querying the Hugging Face Hub dynamically is not practical. 24 | """ 25 | return [ 26 | "all-MiniLM-L6-v2", 27 | "all-mpnet-base-v2", 28 | "multi-qa-mpnet-base-dot-v1", 29 | "all-distilroberta-v1", 30 | "paraphrase-albert-small-v2", 31 | "LaBSE" 32 | ] 33 | 34 | class STVectorizer(BaseVectorizer): 35 | """Vectorizes text using models from the sentence-transformers library.""" 36 | 37 | DEFAULT_MODEL: str = "all-MiniLM-L6-v2" 38 | 39 | def __init__(self, model_config: Dict[str, Any], cache_folder: Optional[str] = None, **kwargs): 40 | super().__init__(vectorizer_name="st") 41 | 42 | if SentenceTransformer is None: 43 | raise ConfigurationError("STVectorizer requires 'sentence-transformers'. Install with: pip install safe_store[sentence-transformers]") 44 | 45 | self.model_name: str = model_config.get("model", self.DEFAULT_MODEL) 46 | if not self.model_name: 47 | raise ConfigurationError("STVectorizer config must include a 'model' key.") 48 | 49 | try: 50 | self.model: SentenceTransformer = SentenceTransformer(self.model_name, cache_folder=cache_folder) 51 | self._dim: int = self.model.get_sentence_embedding_dimension() 52 | self._dtype: np.dtype = np.dtype(np.float32) 53 | ASCIIColors.info(f"Model '{self.model_name}' loaded. 
Dimension: {self._dim}") 54 | except Exception as e: 55 | raise VectorizationError(f"Failed to load Sentence Transformer model '{self.model_name}': {e}") from e 56 | 57 | def get_tokenizer(self) -> Optional[HuggingFaceTokenizerWrapper]: 58 | """Returns the tokenizer from the loaded SentenceTransformer model, wrapped.""" 59 | if hasattr(self.model, 'tokenizer'): 60 | return HuggingFaceTokenizerWrapper(self.model.tokenizer) 61 | return None 62 | 63 | def vectorize(self, texts: List[str]) -> np.ndarray: 64 | if not texts: 65 | return np.empty((0, self.dim), dtype=self.dtype) 66 | try: 67 | embeddings = self.model.encode(texts, convert_to_numpy=True, show_progress_bar=False) 68 | if not isinstance(embeddings, np.ndarray): 69 | raise VectorizationError("SentenceTransformer model did not return a NumPy array.") 70 | if embeddings.dtype != self._dtype: 71 | embeddings = embeddings.astype(self._dtype) 72 | return embeddings 73 | except Exception as e: 74 | raise VectorizationError(f"Error during sentence-transformer encoding: {e}") from e 75 | 76 | @property 77 | def dim(self) -> int: 78 | return self._dim 79 | 80 | @property 81 | def dtype(self) -> np.dtype: 82 | return self._dtype 83 | 84 | @staticmethod 85 | def list_models(**kwargs) -> List[str]: 86 | """ 87 | Returns a list of popular SentenceTransformer models. 88 | This is not an exhaustive list from an API but a curated selection. 89 | """ 90 | return [ 91 | "all-MiniLM-L6-v2", 92 | "all-mpnet-base-v2", 93 | "paraphrase-multilingual-MiniLM-L12-v2", 94 | "distiluse-base-multilingual-cased-v1", 95 | "all-roberta-large-v1" 96 | ] -------------------------------------------------------------------------------- /examples/dynamic_model_selection.py: -------------------------------------------------------------------------------- 1 | # examples/dynamic_model_selection.py 2 | """ 3 | This example demonstrates how to use the `list_available_models` class method 4 | to dynamically discover and select a model from a running Ollama instance, 5 | and then use it to create and query a SafeStore. 6 | """ 7 | import safe_store 8 | from pathlib import Path 9 | import shutil 10 | 11 | # --- Configuration --- 12 | DB_FILE = "dynamic_ollama_store.db" 13 | # This example assumes an Ollama server is running at the default host. 14 | # If your Ollama server is elsewhere, you can specify it: 15 | # OLLAMA_HOST = "http://192.168.1.10:11434" 16 | OLLAMA_HOST = "http://localhost:11434" 17 | 18 | def cleanup(): 19 | """Removes the database file from previous runs.""" 20 | Path(DB_FILE).unlink(missing_ok=True) 21 | Path(f"{DB_FILE}.lock").unlink(missing_ok=True) 22 | print(f"--- Cleaned up old database file: {DB_FILE} ---") 23 | 24 | if __name__ == "__main__": 25 | cleanup() 26 | 27 | # --- 1. 
Discover available Ollama models --- 28 | print(f"\n--- Step 1: Discovering models from Ollama at {OLLAMA_HOST} ---") 29 | try: 30 | # Use the class method to get a list of models from the Ollama server 31 | available_models = safe_store.SafeStore.list_available_models( 32 | vectorizer_name="ollama", 33 | host=OLLAMA_HOST # Pass the host to the method 34 | ) 35 | 36 | if not available_models: 37 | print("\n[ERROR] No models found on the Ollama server.") 38 | print("Please make sure Ollama is running and you have pulled at least one model, for example:") 39 | print(" ollama pull nomic-embed-text") 40 | exit() 41 | 42 | print("Found available models:") 43 | for model in available_models: 44 | print(f" - {model}") 45 | 46 | except safe_store.VectorizationError as e: 47 | print(f"\n[ERROR] Could not connect to the Ollama server: {e}") 48 | print("Please ensure your Ollama server is running and accessible.") 49 | exit() 50 | except Exception as e: 51 | print(f"\n[ERROR] An unexpected error occurred: {e}") 52 | exit() 53 | 54 | # --- 2. Select a model and configure the store --- 55 | print("\n--- Step 2: Selecting a model ---") 56 | # For this example, we'll just pick the first model from the list. 57 | # In a real application, you might let the user choose. 58 | selected_model = available_models[0] 59 | print(f"Selected model: {selected_model}") 60 | 61 | # Prepare the configuration for the SafeStore instance 62 | vectorizer_name = "ollama" 63 | vectorizer_config = { 64 | "model": selected_model, 65 | "host": OLLAMA_HOST 66 | } 67 | 68 | # --- 3. Initialize SafeStore with the selected model --- 69 | print("\n--- Step 3: Initializing SafeStore ---") 70 | store = safe_store.SafeStore( 71 | db_path=DB_FILE, 72 | vectorizer_name=vectorizer_name, 73 | vectorizer_config=vectorizer_config, 74 | log_level=safe_store.LogLevel.INFO 75 | ) 76 | print("SafeStore initialized successfully.") 77 | 78 | # --- 4. Use the store to add and query text --- 79 | print("\n--- Step 4: Adding text and querying ---") 80 | with store: 81 | # Add some sample text 82 | store.add_text( 83 | unique_id="tech-report-01", 84 | text="The new quantum processor shows a 200% performance increase in benchmark tests." 85 | ) 86 | store.add_text( 87 | unique_id="finance-summary-01", 88 | text="Quarterly earnings are up by 15%, driven by the new hardware division." 89 | ) 90 | print("Added two text entries to the store.") 91 | 92 | # Perform a query 93 | query_text = "What were the results of the processor benchmarks?" 94 | print(f"\nQuerying for: '{query_text}'") 95 | results = store.query(query_text, top_k=1) 96 | 97 | if results: 98 | result = results[0] 99 | print(f"Found a relevant chunk with {result['similarity_percent']:.2f}% similarity:") 100 | print(f" -> Text: '{result['chunk_text']}'") 101 | else: 102 | print("No relevant results found for the query.") 103 | 104 | print("\n--- Example Finished ---") -------------------------------------------------------------------------------- /examples/custom_logging.py: -------------------------------------------------------------------------------- 1 | # examples/custom_logging.py 2 | """ 3 | Demonstrates how to configure ascii_colors globally to customize 4 | safe_store's logging output (and any other ascii_colors usage). 
5 | """ 6 | import safe_store 7 | from ascii_colors import ASCIIColors, LogLevel, FileHandler, Formatter 8 | from pathlib import Path 9 | import shutil 10 | 11 | # --- Configuration --- 12 | DB_FILE = "custom_log_store.db" 13 | LOG_FILE = "safe_store_custom.log" 14 | DOC_DIR = Path("temp_docs_custom_log") 15 | 16 | # --- Helper Functions --- 17 | def print_header(title): 18 | print("\n" + "="*10 + f" {title} " + "="*10) 19 | 20 | def cleanup(): 21 | print_header("Cleaning Up") 22 | db_path = Path(DB_FILE) 23 | log_path = Path(LOG_FILE) 24 | lock_path = Path(f"{DB_FILE}.lock") 25 | wal_path = Path(f"{DB_FILE}-wal") 26 | shm_path = Path(f"{DB_FILE}-shm") 27 | 28 | if DOC_DIR.exists(): shutil.rmtree(DOC_DIR) 29 | if db_path.exists(): db_path.unlink() 30 | if log_path.exists(): log_path.unlink() 31 | if lock_path.exists(): lock_path.unlink(missing_ok=True) 32 | if wal_path.exists(): wal_path.unlink(missing_ok=True) 33 | if shm_path.exists(): shm_path.unlink(missing_ok=True) 34 | print("- Cleanup complete.") 35 | 36 | # --- Main Script --- 37 | if __name__ == "__main__": 38 | cleanup() # Start fresh 39 | 40 | print_header("Configuring Global Logging") 41 | 42 | # 1. Set the global minimum log level (e.g., show DEBUG messages) 43 | ASCIIColors.set_log_level(LogLevel.DEBUG) 44 | print(f"- Global log level set to: {LogLevel.DEBUG.name}") 45 | 46 | # 2. Create a file handler to log messages to a file 47 | file_handler = FileHandler(LOG_FILE, encoding='utf-8') 48 | print(f"- Configured file logging to: {LOG_FILE}") 49 | 50 | # 3. Define a format for the file logger 51 | # Example format: Timestamp - Level Name - Message 52 | file_formatter = Formatter( 53 | "%(asctime)s [%(levelname)-8s] %(message)s", 54 | datefmt="%Y-%m-%d %H:%M:%S" 55 | ) 56 | file_handler.setFormatter(file_formatter) 57 | print(f"- Set custom format for file logger.") 58 | 59 | # 4. Add the configured file handler to ascii_colors 60 | ASCIIColors.add_handler(file_handler) 61 | print(f"- Added file handler globally.") 62 | 63 | # Optional: Remove the default console handler if you *only* want file logging 64 | # default_console_handler = ASCIIColors.get_default_handler() 65 | # if default_console_handler: 66 | # ASCIIColors.remove_handler(default_console_handler) 67 | # print("- Removed default console handler.") 68 | # else: 69 | # print("- Default console handler not found or already removed.") 70 | print("- Default console handler remains active (logs will go to console AND file).") 71 | 72 | 73 | # --- Initialize and use safe_store --- 74 | # It will now use the global logging configuration we just set. 
75 | print_header("Initializing and Using safe_store") 76 | print("safe_store actions will now be logged according to the global settings.") 77 | print(f"Check the console output AND the '{LOG_FILE}' file.") 78 | 79 | try: 80 | store = safe_store.SafeStore(DB_FILE) # Uses global log level (DEBUG) 81 | 82 | # Prepare a sample document 83 | DOC_DIR.mkdir(exist_ok=True) 84 | doc_path = DOC_DIR / "logging_test.txt" 85 | doc_path.write_text("This is a test document for custom logging.", encoding='utf-8') 86 | 87 | with store: 88 | # Add the document - DEBUG messages should appear in the log file 89 | store.add_document(doc_path, vectorizer_name="st:all-MiniLM-L6-v2") 90 | 91 | # Perform a query 92 | results = store.query("custom logging test") 93 | print("\n--- Query Results ---") 94 | if results: 95 | print(f"Found {len(results)} result(s).") 96 | else: 97 | print("No results found.") 98 | 99 | except safe_store.ConfigurationError as e: 100 | print(f"\n[ERROR] Missing dependency: {e}") 101 | print("Please install required extras (e.g., pip install safe_store[sentence-transformers])") 102 | except Exception as e: 103 | print(f"\n[ERROR] An unexpected error occurred: {e.__class__.__name__}: {e}") 104 | finally: 105 | print("\n--- End of Script ---") 106 | print(f"Review console output and '{LOG_FILE}' for detailed logs.") 107 | 108 | -------------------------------------------------------------------------------- /safe_store/search/similarity.py: -------------------------------------------------------------------------------- 1 | # safe_store/search/similarity.py 2 | import numpy as np 3 | from ascii_colors import ASCIIColors 4 | from typing import Union 5 | 6 | # Type hint for vectors 7 | VectorInput = Union[np.ndarray, list[float]] # Allow lists as input for query? No, enforce ndarray. 8 | Vector1D = np.ndarray # Shape (D,) 9 | Matrix2D = np.ndarray # Shape (N, D) 10 | 11 | def cosine_similarity(query_vector: Vector1D, vectors: Matrix2D) -> np.ndarray: 12 | """ 13 | Calculates cosine similarity between a single query vector and a matrix of vectors. 14 | 15 | Handles normalization and potential zero vectors gracefully. 16 | 17 | Args: 18 | query_vector: A 1D NumPy array representing the query vector (shape D). 19 | vectors: A 2D NumPy array where each row is a vector to compare against 20 | (shape N, D). Can also handle the case where vectors is 1D 21 | (shape D) representing a single comparison vector, by reshaping it. 22 | 23 | Returns: 24 | A 1D NumPy array of shape (N,) containing the cosine similarity scores, 25 | where each score is between -1.0 and 1.0. 26 | 27 | Raises: 28 | TypeError: If inputs are not NumPy arrays. 29 | ValueError: If input shapes are incompatible (e.g., query is not 1D, 30 | matrix is not 1D or 2D, or dimensions mismatch). 
31 | """ 32 | if not isinstance(query_vector, np.ndarray) or not isinstance(vectors, np.ndarray): 33 | raise TypeError("Input query_vector and vectors must be NumPy arrays.") 34 | 35 | # Validate query_vector shape 36 | if query_vector.ndim != 1: 37 | raise ValueError(f"Query vector must be 1D, but got shape {query_vector.shape}") 38 | 39 | # Validate and potentially reshape vectors matrix 40 | if vectors.ndim == 1: 41 | # Allow comparing query to a single vector passed as 1D array 42 | if query_vector.shape[0] == vectors.shape[0]: 43 | vectors = vectors.reshape(1, -1) # Reshape to (1, D) 44 | ASCIIColors.debug("Reshaped 1D input 'vectors' to 2D for single vector comparison.") 45 | else: 46 | raise ValueError( 47 | f"If 'vectors' is 1D, its dimension ({vectors.shape[0]}) must match " 48 | f"query_vector dimension ({query_vector.shape[0]})" 49 | ) 50 | elif vectors.ndim != 2: 51 | raise ValueError(f"Input 'vectors' must be a 1D or 2D array, but got shape {vectors.shape}") 52 | 53 | # Dimension compatibility check 54 | if query_vector.shape[0] != vectors.shape[1]: 55 | raise ValueError( 56 | f"Query vector dimension ({query_vector.shape[0]}) must match " 57 | f"the dimension of vectors in the matrix ({vectors.shape[1]})" 58 | ) 59 | 60 | num_vectors = vectors.shape[0] 61 | if num_vectors == 0: 62 | ASCIIColors.debug("Input 'vectors' matrix is empty, returning empty similarity array.") 63 | return np.array([], dtype=query_vector.dtype) # Return empty array of appropriate type 64 | 65 | ASCIIColors.debug(f"Calculating cosine similarity: query_shape={query_vector.shape}, matrix_shape={vectors.shape}") 66 | 67 | # Calculate norms, adding epsilon for numerical stability and avoiding zero division 68 | epsilon = np.finfo(query_vector.dtype).eps # Use machine epsilon for the data type 69 | query_norm = np.linalg.norm(query_vector) 70 | vectors_norm = np.linalg.norm(vectors, axis=1) # Norm of each row vector 71 | 72 | # Handle potential zero vectors by replacing norm with epsilon 73 | query_norm_safe = query_norm if query_norm > epsilon else epsilon 74 | vectors_norm_safe = np.where(vectors_norm > epsilon, vectors_norm, epsilon) 75 | 76 | # Normalize vectors 77 | # Using np.divide with 'out' and 'where' could be slightly more robust, but direct division is common 78 | norm_query = query_vector / query_norm_safe 79 | # Use broadcasting for matrix normalization: vectors_norm_safe[:, np.newaxis] ensures (N, 1) shape 80 | norm_vectors = vectors / vectors_norm_safe[:, np.newaxis] 81 | 82 | # Calculate dot product between normalized matrix rows and the normalized query vector 83 | # Result is (N, D) dot (D,) -> (N,) 84 | similarity_scores = np.dot(norm_vectors, norm_query) 85 | 86 | # Clip scores to be strictly within [-1, 1] due to potential floating point inaccuracies 87 | similarity_scores = np.clip(similarity_scores, -1.0, 1.0) 88 | 89 | ASCIIColors.debug(f"Similarity calculation complete. Output shape: {similarity_scores.shape}") 90 | return similarity_scores -------------------------------------------------------------------------------- /docs/encryption.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Encryption 3 | ========== 4 | 5 | ``safe_store`` provides optional encryption at rest for the text content of document chunks stored in the database. This helps protect sensitive information if the database file itself is exposed. 
6 | 7 | How it Works 8 | ------------ 9 | 10 | * **Algorithm:** Uses Fernet symmetric authenticated cryptography from the `cryptography `_ library. Fernet uses AES-128 in CBC mode with PKCS7 padding for encryption and HMAC with SHA256 for authentication. 11 | * **Key Derivation:** When you provide an ``encryption_key`` (password) during ``safe_store`` initialization, a strong 256-bit encryption key suitable for Fernet is derived using PBKDF2 HMAC SHA256. 12 | * **Salt:** For simplicity within ``safe_store``, a **fixed, hardcoded salt** is used during key derivation. This means the same password will always produce the same encryption key. See the Security Considerations below. 13 | * **Encryption Target:** Only the ``chunk_text`` stored in the ``chunks`` table is encrypted. Other data like document paths, metadata, vectorizer parameters, and the vectors themselves are **not** encrypted by this feature. 14 | * **Automatic Handling:** Encryption and decryption are handled automatically during ``add_document`` and ``query`` operations if the ``safe_store`` instance was initialized with the correct ``encryption_key``. 15 | 16 | Enabling Encryption 17 | ------------------- 18 | 19 | 1. **Install Dependency:** Ensure the ``cryptography`` library is installed: 20 | .. code-block:: bash 21 | 22 | pip install safe_store[encryption] 23 | # or 24 | pip install safe_store[all] 25 | 26 | 2. **Provide Key on Init:** Pass your chosen password (key) to the ``encryption_key`` parameter when creating the ``safe_store`` instance: 27 | 28 | .. code-block:: python 29 | 30 | import safe_store 31 | 32 | my_password = "your-very-strong-password-here" # Keep this safe! 33 | 34 | store = safe_store.SafeStore( 35 | "encrypted_store.db", 36 | encryption_key=my_password 37 | ) 38 | 39 | # Now, when you add documents, chunk text will be encrypted 40 | with store: 41 | store.add_document("path/to/sensitive_doc.txt") 42 | 43 | # When you query, chunk text will be automatically decrypted 44 | results = store.query("search term") 45 | print(results[0]['chunk_text']) # Prints decrypted text 46 | 47 | Usage Notes 48 | ----------- 49 | 50 | * **Consistency:** You **must** use the exact same ``encryption_key`` every time you open a specific database file that contains encrypted data. 51 | * **Querying without Key:** If you open an encrypted database without providing the key (or with the wrong key), query results will contain placeholder text like ``[Encrypted - Key Unavailable]`` or ``[Encrypted - Decryption Failed]`` instead of the actual chunk text. 52 | * **Adding Vectorizations:** If you use ``add_vectorization`` for a method like TF-IDF that requires fitting on existing text, ``safe_store`` will attempt to decrypt the necessary chunks using the provided key. If the key is missing or incorrect, the operation will fail. 53 | * **Key Management:** **You are solely responsible for managing your ``encryption_key`` securely.** If you lose the key, the encrypted data in the database will be permanently unrecoverable. Do not hardcode keys directly in your source code in production environments. Consider using environment variables, configuration files with appropriate permissions, or dedicated secrets management systems. 54 | 55 | Security Considerations 56 | ----------------------- 57 | 58 | * **Fixed Salt:** As mentioned, ``safe_store`` currently uses a fixed salt for PBKDF2 key derivation for simplicity. 
This is less secure than using a unique, randomly generated salt for each password/database, as it doesn't fully protect against precomputed rainbow table attacks if the fixed salt becomes known. For high-security requirements, this implementation might not be sufficient. 59 | * **Metadata Not Encrypted:** Document paths, metadata, and vector information remain unencrypted. Ensure no sensitive information is placed in document metadata if the database file requires protection. 60 | * **Scope:** Encryption only applies to chunk text *at rest* in the SQLite file. Data is decrypted in memory during processing (e.g., querying). 61 | 62 | This feature provides a reasonable layer of protection against casual inspection of the database file but relies heavily on the security of your chosen ``encryption_key`` and understanding its limitations. 63 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 
107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 168 | #.idea/ 169 | 170 | # Ruff stuff: 171 | .ruff_cache/ 172 | 173 | # PyPI configuration file 174 | .pypirc 175 | config.toml 176 | 177 | .vscode 178 | config.toml 179 | temp_uploaded_files_webui 180 | webui_safestore_docs 181 | temp_docs_basic 182 | temp_docs_graph_example 183 | 184 | *.db 185 | *.db-shm 186 | *.db-wal 187 | *.lock 188 | 189 | 190 | benchmark_output 191 | # Lollms vscoder 192 | .lollms -------------------------------------------------------------------------------- /tests/test_store_phase4.py: -------------------------------------------------------------------------------- 1 | # tests/test_store_phase4.py 2 | import pytest 3 | import sqlite3 4 | from pathlib import Path 5 | from unittest.mock import patch, MagicMock, call 6 | import re 7 | 8 | # Import exceptions and modules 9 | from safe_store import store as safe_store_store_module 10 | from safe_store import SafeStore, LogLevel 11 | from safe_store.core import db 12 | from safe_store.core.exceptions import ConcurrencyError, FileHandlingError, ConfigurationError, SafeStoreError 13 | 14 | # Import filelock components 15 | from filelock import Timeout, FileLock 16 | 17 | # --- FIX: Define availability check locally --- 18 | # Check for parsing libraries availability within this module 19 | try: 20 | import pypdf 21 | import docx 22 | import bs4 23 | import lxml 24 | PARSING_LIBS_AVAILABLE = True 25 | except ImportError: 26 | PARSING_LIBS_AVAILABLE = False 27 | 28 | # Helper for log checks 29 | def assert_log_call_containing(mock_logger, expected_substring): 30 | """Checks if any call to the mock logger contained the substring.""" 31 | found = False 32 | # Check positional args 33 | for call_args in mock_logger.call_args_list: 34 | args, kwargs = call_args 35 | if args and isinstance(args[0], str) and expected_substring in args[0]: 36 | found = True 37 | break 38 | 
# Check method calls if not found in direct calls (needed for specific methods like info, debug) 39 | if not found: 40 | for method_call in mock_logger.method_calls: 41 | call_name, args, kwargs = method_call 42 | if args and isinstance(args[0], str) and expected_substring in args[0]: 43 | found = True 44 | break 45 | assert found, f"Expected log call containing '{expected_substring}' not found in {mock_logger.call_args_list} or {mock_logger.method_calls}" 46 | 47 | # +++ FIX: Added mock_db_colors to decorator and arguments +++ 48 | @patch('safe_store.core.db.ASCIIColors') 49 | @patch('safe_store.store.ASCIIColors') 50 | @patch('safe_store.vectorization.manager.ASCIIColors') 51 | def test_store_close_and_context_manager(mock_manager_colors, mock_store_colors, mock_db_colors, temp_db_path): 52 | """Test explicit close() and context manager usage.""" 53 | # mock_db_colors is now available 54 | store = SafeStore(temp_db_path, log_level=LogLevel.DEBUG) 55 | assert store.conn is not None 56 | assert not store._is_closed 57 | # Check initial connection log 58 | assert_log_call_containing(mock_db_colors.debug, "Connected to database:") 59 | 60 | # --- Ensure cache is populated before first close --- 61 | try: 62 | with store: 63 | _ = store.vectorizer_manager.get_vectorizer(store.DEFAULT_VECTORIZER, store.conn, None) 64 | except Exception as e: 65 | if not isinstance(e, ConfigurationError): 66 | print(f"Warning: Error populating cache in test: {e}") 67 | pass 68 | 69 | store.close() 70 | assert store.conn is None 71 | assert store._is_closed 72 | assert_log_call_containing(mock_store_colors.info, "safe_store connection closed.") 73 | try: 74 | assert_log_call_containing(mock_manager_colors.debug, "Cleared vectorizer manager cache") 75 | except AssertionError: 76 | print("Cache clear log not found, cache might have been empty.") 77 | 78 | 79 | # Test context manager re-opening and closing 80 | mock_store_colors.reset_mock() 81 | mock_manager_colors.reset_mock() 82 | mock_db_colors.reset_mock() # Reset this mock too 83 | with SafeStore(temp_db_path, log_level=LogLevel.DEBUG) as store_ctx: 84 | assert store_ctx.conn is not None 85 | assert not store_ctx._is_closed 86 | # +++ FIX: Check the correct mock for the connection log +++ 87 | assert_log_call_containing(mock_db_colors.debug, "Connected to database:") 88 | 89 | # --- Ensure cache is populated before second close (via exit) --- 90 | try: 91 | _ = store_ctx.vectorizer_manager.get_vectorizer(store.DEFAULT_VECTORIZER, store_ctx.conn, None) 92 | except Exception as e: 93 | if not isinstance(e, ConfigurationError): 94 | print(f"Warning: Error populating cache in test context: {e}") 95 | pass 96 | 97 | # Check logs after exiting context 98 | assert store_ctx.conn is None 99 | assert store_ctx._is_closed 100 | assert_log_call_containing(mock_store_colors.debug, "safe_store context closed cleanly.") 101 | try: 102 | assert_log_call_containing(mock_manager_colors.debug, "Cleared vectorizer manager cache") 103 | except AssertionError: 104 | print("Cache clear log not found after context exit, cache might have been empty.") 105 | 106 | 107 | def test_list_documents_empty(safe_store_instance: SafeStore): 108 | """Test listing documents from an empty store.""" 109 | with safe_store_instance as store: 110 | docs = store.list_documents() 111 | assert docs == [] 112 | 113 | def test_list_vectorization_methods_empty(safe_store_instance: SafeStore): 114 | """Test listing methods from an empty store.""" 115 | with safe_store_instance as store: 116 | methods = 
store.list_vectorization_methods() 117 | assert methods == [] 118 | 119 | 120 | def test_list_documents_populated(populated_store: SafeStore): 121 | """Test listing documents after adding some.""" 122 | with populated_store as store: 123 | docs = store.list_documents() 124 | 125 | assert len(docs) == 2 126 | doc1_info = next((d for d in docs if "sample.txt" in d["file_path"]), None) 127 | doc2_info = next((d for d in docs if "sample2.txt" in d["file_path"]), None) 128 | assert doc1_info is not None; assert doc2_info is not None 129 | assert doc1_info["doc_id"] is not None; assert isinstance(doc1_info["file_path"], str) 130 | assert doc1_info["file_hash"] is not None; assert doc1_info["added_timestamp"] is not None 131 | assert doc1_info["metadata"] is None 132 | 133 | 134 | def test_list_vectorization_methods_populated(populated_store: SafeStore): 135 | """Test listing methods after adding documents.""" 136 | with populated_store as store: 137 | methods = store.list_vectorization_methods() 138 | 139 | st_method = next((m for m in methods if m["method_name"] == store.DEFAULT_VECTORIZER), None) 140 | assert st_method is not None, f"Default vectorizer {store.DEFAULT_VECTORIZER} not found in listed methods." 141 | 142 | assert st_method["method_type"] == "sentence_transformer" 143 | assert st_method["vector_dim"] == 384 # Mocked or real dimension 144 | assert st_method["vector_dtype"] == "float32" 145 | assert st_method["params"] == {}, f"Expected params to be {{}}, got {st_method['params']}" 146 | assert len(methods) == 1, f"Expected 1 method, found {len(methods)}" -------------------------------------------------------------------------------- /safe_store/vectorization/manager.py: -------------------------------------------------------------------------------- 1 | # safe_store/vectorization/manager.py 2 | import json 3 | import yaml 4 | import pipmaster as pm 5 | from typing import Tuple, Optional, Dict, Any, List 6 | from pathlib import Path 7 | 8 | from ..core.exceptions import ConfigurationError, VectorizationError 9 | from .base import BaseVectorizer 10 | from ascii_colors import ASCIIColors 11 | from .utils import load_vectorizer_module 12 | 13 | class VectorizationManager: 14 | """ 15 | Manages and creates vectorizer instances from built-in or custom locations. 16 | Also provides methods to discover available vectorizers and their configurations. 
17 | """ 18 | 19 | def __init__(self, cache_folder: Optional[str] = None, custom_vectorizers_path: Optional[str] = None): 20 | pm.ensure_packages(["PyYAML"]) 21 | self.cache_folder = Path(cache_folder) if cache_folder else None 22 | if self.cache_folder: 23 | self.cache_folder.mkdir(parents=True, exist_ok=True) 24 | 25 | self.custom_vectorizers_path = custom_vectorizers_path 26 | self._cache: Dict[str, BaseVectorizer] = {} 27 | 28 | @staticmethod 29 | def _create_unique_name(vectorizer_name: str, config: Optional[Dict[str, Any]]) -> str: 30 | if not config: 31 | return vectorizer_name 32 | config_str = json.dumps(config, sort_keys=True, separators=(',', ':')) 33 | return f"{vectorizer_name}:{config_str}" 34 | 35 | 36 | @staticmethod 37 | def _create_vectorizer_ascii_infos(vectorizer_name: str, config: Optional[Dict[str, Any]]) -> str: 38 | lines = [] 39 | 40 | lines.append(ASCIIColors.bold(ASCIIColors.magenta("════════════════════════════════════════"), emit=False)) 41 | lines.append(ASCIIColors.bold(ASCIIColors.magenta(" VECTORISER INFORMATION"), emit=False)) 42 | lines.append(ASCIIColors.bold(ASCIIColors.magenta("════════════════════════════════════════", emit=False), emit=False)) 43 | lines.append("") 44 | 45 | lines.append( 46 | f"{ASCIIColors.cyan('Name')} : " 47 | f"{ASCIIColors.bold(ASCIIColors.green(vectorizer_name, emit=False),emit=False)}" 48 | ) 49 | 50 | if config: 51 | lines.append("") 52 | lines.append(ASCIIColors.yellow("Configuration:")) 53 | lines.append(ASCIIColors.orange("──────────────",emit=False)) 54 | 55 | pretty_config = json.dumps(config, indent=2, sort_keys=True) 56 | for line in pretty_config.splitlines(): 57 | lines.append(ASCIIColors.blue(line,emit=False)) 58 | else: 59 | lines.append("") 60 | lines.append(ASCIIColors.red("No configuration provided.",emit=False)) 61 | 62 | lines.append("") 63 | lines.append(ASCIIColors.bold(ASCIIColors.magenta("════════════════════════════════════════",emit=False),emit=False)) 64 | 65 | return "\n".join(lines) 66 | 67 | def list_vectorizers(self) -> List[Dict[str, Any]]: 68 | """Scans for available vectorizers and returns their metadata from description.yaml.""" 69 | vectorizers = [] 70 | 71 | # Scan built-in methods directory 72 | methods_path = Path(__file__).parent / "methods" 73 | for p in methods_path.iterdir(): 74 | if p.is_dir() and (p / "description.yaml").exists(): 75 | with open(p / "description.yaml", 'r', encoding='utf-8') as f: 76 | try: 77 | data = yaml.safe_load(f) 78 | data['name'] = p.name # Add the folder name as the identifier 79 | vectorizers.append(data) 80 | except yaml.YAMLError: 81 | ASCIIColors.warning(f"Could not parse description.yaml for vectorizer '{p.name}'") 82 | 83 | # Scan custom path if provided 84 | if self.custom_vectorizers_path: 85 | custom_path = Path(self.custom_vectorizers_path) 86 | if custom_path.is_dir(): 87 | for p in custom_path.iterdir(): 88 | if p.is_dir() and (p / "description.yaml").exists(): 89 | with open(p / "description.yaml", 'r', encoding='utf-8') as f: 90 | try: 91 | data = yaml.safe_load(f) 92 | data['name'] = p.name 93 | data['is_custom'] = True 94 | vectorizers.append(data) 95 | except yaml.YAMLError: 96 | ASCIIColors.warning(f"Could not parse description.yaml for custom vectorizer '{p.name}'") 97 | 98 | return vectorizers 99 | 100 | def get_vectorizer( 101 | self, 102 | vectorizer_name: str, 103 | vectorizer_config: Optional[Dict[str, Any]], 104 | ) -> BaseVectorizer: 105 | # Fix: Add an alias for 'st' to point to the correct folder. 
106 | # Note: The folder 'sentense_transformer' has a typo and should ideally be 'sentence_transformer'. 107 | if vectorizer_name == "st": 108 | vectorizer_name = "sentense_transformer" 109 | 110 | unique_name = self._create_unique_name(vectorizer_name, vectorizer_config) 111 | 112 | if unique_name in self._cache: 113 | return self._cache[unique_name] 114 | 115 | ASCIIColors.info(f"Initializing vectorizer:\n{self._create_vectorizer_ascii_infos(vectorizer_name, vectorizer_config)}") 116 | config_for_init = vectorizer_config or {} 117 | 118 | try: 119 | module = load_vectorizer_module(vectorizer_name, self.custom_vectorizers_path) 120 | 121 | # The class name is now fetched from the module itself 122 | if not hasattr(module, 'class_name'): 123 | raise ConfigurationError(f"Vectorizer module '{vectorizer_name}' does not define a 'class_name' variable.") 124 | 125 | VectorizerClass = getattr(module, module.class_name) 126 | 127 | if not issubclass(VectorizerClass, BaseVectorizer): 128 | raise ConfigurationError(f"Class '{module.class_name}' does not inherit from BaseVectorizer.") 129 | 130 | vectorizer_instance = VectorizerClass(model_config=config_for_init, cache_folder=self.cache_folder) 131 | 132 | except (ImportError, FileNotFoundError) as e: 133 | raise ConfigurationError(f"Could not find or load vectorizer module for '{vectorizer_name}'.") from e 134 | except Exception as e: 135 | raise VectorizationError(f"Failed to initialize '{vectorizer_name}' vectorizer: {e}") from e 136 | 137 | self._cache[unique_name] = vectorizer_instance 138 | return vectorizer_instance 139 | 140 | def clear_cache(self) -> None: 141 | self._cache.clear() -------------------------------------------------------------------------------- /safe_store/vectorization/methods/ollama/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import List, Optional, Dict, Any 3 | from ...base import BaseVectorizer 4 | from ....core.exceptions import ConfigurationError, VectorizationError 5 | from ascii_colors import ASCIIColors, trace_exception 6 | 7 | # each vectorizer must have a class name variable to be identified 8 | class_name="OllamaVectorizer" 9 | 10 | # Attempt import of ollama and related error types, handle gracefully 11 | try: 12 | import pipmaster as pm 13 | pm.ensure_packages(["ollama"]) 14 | import ollama 15 | _OllamaResponseError = ollama.ResponseError 16 | _OllamaRequestError = ollama.RequestError 17 | _OLLAMA_AVAILABLE = True 18 | except ImportError: 19 | _OLLAMA_AVAILABLE = False 20 | ollama = None 21 | class _OllamaResponseError(Exception): pass 22 | class _OllamaRequestError(Exception): pass 23 | 24 | def list_available_models(**kwargs) -> List[str]: 25 | """Dynamically lists models from a running Ollama server.""" 26 | if not _OLLAMA_AVAILABLE: 27 | raise ConfigurationError("Ollama support is not installed. Please run: pip install safe_store[ollama]") 28 | 29 | try: 30 | response = ollama.list() 31 | # The structure from ollama.list() is a dict with a 'models' key 32 | # which is a list of dicts, each with a 'name' key. 33 | return [model.model for model in response.models] 34 | except ollama.RequestError as e: 35 | raise VectorizationError("Could not connect to the Ollama server. 
Please ensure it is running.") from e 36 | except Exception as e: 37 | raise VectorizationError(f"An unexpected error occurred while listing Ollama models: {e}") from e 38 | 39 | class OllamaVectorizer(BaseVectorizer): 40 | """ 41 | Vectorizes text using models hosted by an Ollama instance. 42 | Requires the `ollama` Python library to be installed. 43 | """ 44 | 45 | def __init__(self, 46 | model_config: Dict[str, Any], 47 | **kwargs): 48 | """ 49 | Initializes the OllamaVectorizer. 50 | 51 | Args: 52 | model_config: A dictionary containing the vectorizer's configuration. 53 | - "model" (str): Mandatory. The name of the model to use. 54 | - "host" (str): Optional. The URL of the Ollama server. 55 | Defaults to http://localhost:11434 or OLLAMA_HOST env var. 56 | 57 | Raises: 58 | ConfigurationError: If 'ollama' is not installed or config is invalid. 59 | VectorizationError: If connection to Ollama fails or the model is invalid. 60 | """ 61 | super().__init__(vectorizer_name="ollama") 62 | if not _OLLAMA_AVAILABLE or ollama is None: 63 | raise ConfigurationError("OllamaVectorizer requires the 'ollama' library. Install with: pip install safe_store[ollama]") 64 | 65 | if not isinstance(model_config, dict) or "model" not in model_config: 66 | raise ConfigurationError("Ollama vectorizer config must be a dictionary with a 'model' key.") 67 | 68 | self.model_name: str = model_config["model"] 69 | self.host: Optional[str] = model_config.get("host") # Let the client handle default 70 | 71 | ASCIIColors.info(f"Initializing Ollama client. Model: {self.model_name}, Host: {self.host or 'default'}") 72 | try: 73 | self.client: ollama.Client = ollama.Client(host=self.host.strip() if self.host else None) # Guard against a missing host before stripping 74 | 75 | # Test connection and get embedding dimension 76 | test_prompt = "hello world" 77 | response = self.client.embeddings(model=self.model_name, prompt=test_prompt) 78 | 79 | embedding = response.get("embedding") 80 | 81 | if not isinstance(embedding, list) or not embedding: 82 | raise VectorizationError(f"Ollama model '{self.model_name}' did not return a valid embedding.") 83 | 84 | self._dim = len(embedding) 85 | if self._dim == 0: 86 | raise VectorizationError(f"Ollama model '{self.model_name}' returned a zero-dimension embedding.") 87 | 88 | self._dtype = np.dtype(np.float32) 89 | ASCIIColors.info(f"Ollama model '{self.model_name}' ready. 
Dimension: {self._dim}") 90 | 91 | except _OllamaResponseError as e: 92 | trace_exception(e) 93 | raise VectorizationError(f"Ollama API error for model '{self.model_name}': {e.error}") from e 94 | except _OllamaRequestError as e: 95 | trace_exception(e) 96 | raise VectorizationError(f"Ollama request error connecting to host '{self.host or 'default'}': {e}") from e 97 | except Exception as e: 98 | trace_exception(e) 99 | raise VectorizationError(f"Failed to initialize Ollama vectorizer '{self.model_name}': {e}") from e 100 | 101 | def vectorize(self, texts: List[str]) -> np.ndarray: 102 | if not texts: 103 | return np.empty((0, self.dim), dtype=self.dtype) 104 | 105 | embeddings_list = [] 106 | try: 107 | for i, text in enumerate(texts): 108 | if not text.strip(): 109 | embeddings_list.append(np.zeros(self.dim, dtype=self._dtype)) 110 | continue 111 | 112 | response = self.client.embeddings(model=self.model_name, prompt=text) 113 | embedding_vector = response.get("embedding") 114 | 115 | if not isinstance(embedding_vector, list) or len(embedding_vector) != self.dim: 116 | raise VectorizationError(f"Ollama model '{self.model_name}' returned an invalid embedding for text at index {i}.") 117 | embeddings_list.append(embedding_vector) 118 | 119 | embeddings_array = np.array(embeddings_list, dtype=self._dtype) 120 | 121 | if embeddings_array.shape != (len(texts), self.dim): 122 | raise VectorizationError(f"Ollama vectorization resulted in unexpected shape {embeddings_array.shape}.") 123 | 124 | return embeddings_array 125 | 126 | except (_OllamaResponseError, _OllamaRequestError) as e: 127 | raise VectorizationError(f"Ollama API error during vectorization: {e}") from e 128 | except Exception as e: 129 | raise VectorizationError(f"Unexpected error during Ollama vectorization: {e}") from e 130 | 131 | @property 132 | def dim(self) -> int: 133 | return self._dim 134 | 135 | @property 136 | def dtype(self) -> np.dtype: 137 | return self._dtype 138 | 139 | @staticmethod 140 | def list_models(**kwargs) -> List[str]: 141 | """Lists available models from the running Ollama instance.""" 142 | try: 143 | response = ollama.list() 144 | # The structure from ollama.list() is a dict with a 'models' key 145 | # which is a list of dicts, each with a 'name' key. 146 | return [model.model for model in response.models] 147 | except ollama.RequestError as e: 148 | raise VectorizationError("Could not connect to the Ollama server. 
Please ensure it is running.") from e 149 | except Exception as e: 150 | raise VectorizationError(f"An unexpected error occurred while listing Ollama models: {e}") from e 151 | -------------------------------------------------------------------------------- /examples/metadata_generation.py: -------------------------------------------------------------------------------- 1 | # examples/metadata_generation.py 2 | import safe_store 3 | from safe_store import LogLevel 4 | import pipmaster as pm 5 | from pathlib import Path 6 | import shutil 7 | import json 8 | 9 | # --- Configuration --- 10 | DB_FILE = "metadata_example.db" 11 | DOC_DIR = Path("temp_docs_metadata_example") 12 | ENCRYPTION_KEY = "my-super-secret-key-for-testing" # Use a strong key in production 13 | 14 | 15 | BINDING_NAME = "ollama" 16 | HOST_ADDRESS = "http://localhost:11434" 17 | MODEL_NAME = "mistral:latest" 18 | 19 | # --- Example Setup --- 20 | def setup(): 21 | """Cleans up old files and creates new ones for the example.""" 22 | print_header("Setting Up Example Environment") 23 | # Clean up DB 24 | db_path = Path(DB_FILE) 25 | paths_to_delete = [db_path, Path(f"{db_path}.lock")] 26 | for p in paths_to_delete: 27 | p.unlink(missing_ok=True) 28 | 29 | # Clean up and create doc directory 30 | if DOC_DIR.exists(): 31 | shutil.rmtree(DOC_DIR) 32 | DOC_DIR.mkdir(exist_ok=True) 33 | 34 | # Create a sample document 35 | article_content = """ 36 | The Art of Quantum Computing: A Gentle Introduction 37 | 38 | Quantum computing represents a fundamental shift in computation. Unlike classical 39 | computers that use bits (0s and 1s), quantum computers use qubits, which can 40 | exist in a superposition of both 0 and 1 simultaneously. This property, along 41 | with entanglement, allows quantum computers to explore a vast number of 42 | possibilities at once, promising to solve complex problems in fields like 43 | medicine, materials science, and cryptography that are intractable for even the 44 | most powerful classical supercomputers. However, building and controlling stable 45 | qubits remains a significant engineering challenge due to their sensitivity to 46 | environmental noise. 47 | """ 48 | (DOC_DIR / "quantum_intro.txt").write_text(article_content.strip()) 49 | print(f"- Created sample document in: {DOC_DIR.resolve()}") 50 | return DOC_DIR / "quantum_intro.txt" 51 | 52 | def print_header(title: str): 53 | print("\n" + "="*20 + f" {title} " + "="*20) 54 | 55 | # --- Metadata Generation with Lollms --- 56 | def generate_metadata_with_lollms(file_content: str) -> dict: 57 | """ 58 | Uses lollms-client to generate a title and summary for the given text. 59 | """ 60 | print_header("Generating Metadata with LOLLMS") 61 | try: 62 | pm.ensure_packages(["lollms_client"]) 63 | from lollms_client import LollmsClient 64 | # Make sure you have a lollms-webui instance running with a model loaded. 65 | # This example assumes a local instance at the default port. 66 | client = LollmsClient(llm_binding_name=BINDING_NAME, llm_binding_config={"host_address": HOST_ADDRESS, "model_name": MODEL_NAME}) 67 | except Exception as e: 68 | print(f" [SKIP] Could not initialize LollmsClient. Is it installed and running? Error: {e}") 69 | return {"error": "LollmsClient not available"} 70 | 71 | prompt = f""" 72 | Analyze the following document and extract a concise title and a one-sentence summary. 73 | Your response MUST be in a raw JSON format with "title" and "summary" as keys. 
74 | 75 | Document: 76 | --- 77 | {file_content} 78 | --- 79 | 80 | JSON Response: 81 | """ 82 | 83 | print(" - Sending prompt to LLM for metadata extraction...") 84 | try: 85 | response = client.generate_text(prompt, max_new_tokens=150) 86 | print(" - Received response from LLM.") 87 | # The response should be a JSON string, let's parse it 88 | metadata = json.loads(response) 89 | print(f" - Successfully parsed metadata: {metadata}") 90 | return metadata 91 | except Exception as e: 92 | print(f" [ERROR] Failed to generate or parse metadata from LLM. Error: {e}") 93 | return {"error": f"LLM metadata generation failed: {e}"} 94 | 95 | # --- Main Script --- 96 | if __name__ == "__main__": 97 | sample_doc_path = setup() 98 | 99 | # 1. Generate Metadata 100 | document_text = sample_doc_path.read_text() 101 | generated_metadata = generate_metadata_with_lollms(document_text) 102 | 103 | if "error" in generated_metadata: 104 | print("\n Proceeding with fallback metadata.") 105 | generated_metadata = { 106 | "title": "Fallback Title: Quantum Computing", 107 | "summary": "A fallback summary about qubits and their challenges." 108 | } 109 | 110 | # 2. Initialize SafeStore with Encryption 111 | print_header("Initializing SafeStore with Encryption") 112 | try: 113 | # Note: We are now passing the encryption key 114 | store = safe_store.SafeStore( 115 | db_path=DB_FILE, 116 | vectorizer_name="st", 117 | vectorizer_config={"model": "all-MiniLM-L6-v2"}, 118 | log_level=LogLevel.INFO, 119 | encryption_key=ENCRYPTION_KEY 120 | ) 121 | print(" - SafeStore initialized.") 122 | except Exception as e: 123 | print(f" [FATAL] Could not initialize SafeStore: {e}") 124 | exit(1) 125 | 126 | # 3. Add the document WITH the generated metadata 127 | print_header("Adding Document with Generated Metadata") 128 | with store: 129 | store.add_document( 130 | file_path=sample_doc_path, 131 | metadata=generated_metadata 132 | ) 133 | print(f" - Document '{sample_doc_path.name}' added to the store.") 134 | 135 | # 4. List documents to verify metadata storage (and encryption) 136 | print("\n --- Verifying Stored Documents ---") 137 | docs = store.list_documents() 138 | for doc in docs: 139 | print(f" - Found Doc ID: {doc['doc_id']}, Path: {doc['file_path']}") 140 | print(f" Metadata: {doc['metadata']}") 141 | 142 | # 5. Query the store and inspect the results 143 | print_header("Querying the Store") 144 | query = "What are the difficulties of building qubits?" 145 | print(f" - Query: '{query}'") 146 | 147 | with store: 148 | results = store.query(query, top_k=1) 149 | 150 | if not results: 151 | print(" - No results found.") 152 | else: 153 | top_result = results[0] 154 | print("\n --- Top Query Result ---") 155 | print(f" - Similarity: {top_result['similarity_percent']:.2f}%") 156 | 157 | print("\n - Document Metadata (from result object):") 158 | print(f" {top_result['document_metadata']}") 159 | 160 | print("\n - Full Chunk Text (with prepended context):") 161 | print("-" * 50) 162 | print(top_result['chunk_text']) 163 | print("-" * 50) 164 | 165 | # Verification 166 | assert "Document Context" in top_result['chunk_text'] 167 | assert generated_metadata['title'] in top_result['chunk_text'] 168 | print("\n [SUCCESS] Verified that document metadata was prepended to the chunk text.") 169 | 170 | # 6. 
Final cleanup 171 | print_header("Final Cleanup") 172 | if DOC_DIR.exists(): 173 | shutil.rmtree(DOC_DIR) 174 | print(f"- Removed temporary directory: {DOC_DIR}") 175 | 176 | print("\n--- Example Finished ---") -------------------------------------------------------------------------------- /safe_store/security/encryption.py: -------------------------------------------------------------------------------- 1 | # safe_store/security/encryption.py 2 | import base64 3 | from typing import Optional, Tuple 4 | 5 | from ascii_colors import ASCIIColors 6 | from ..core.exceptions import EncryptionError, ConfigurationError 7 | 8 | # Attempt import 9 | try: 10 | from cryptography.fernet import Fernet, InvalidToken 11 | from cryptography.hazmat.primitives import hashes 12 | from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC 13 | CRYPTOGRAPHY_AVAILABLE = True 14 | except ImportError: 15 | CRYPTOGRAPHY_AVAILABLE = False 16 | Fernet, InvalidToken = None, None 17 | PBKDF2HMAC, hashes = None, None 18 | 19 | 20 | SALT_SIZE = 16 # Standard salt size for PBKDF2 21 | # Recommended PBKDF2 iterations (adjust based on security needs vs performance) 22 | # OWASP recommendation as of 2023 is >= 600,000 for SHA256 23 | PBKDF2_ITERATIONS = 600_000 24 | 25 | class Encryptor: 26 | """ 27 | Handles symmetric encryption and decryption using Fernet (AES-128-CBC + HMAC). 28 | 29 | Derives a valid Fernet key from a user-provided password using PBKDF2. 30 | """ 31 | 32 | def __init__(self, password: Optional[str]): 33 | """ 34 | Initializes the Encryptor. 35 | 36 | Args: 37 | password: The password to use for encryption/decryption. If None, 38 | encryption/decryption methods will raise errors. 39 | 40 | Raises: 41 | ConfigurationError: If 'cryptography' is not installed. 42 | """ 43 | if not CRYPTOGRAPHY_AVAILABLE: 44 | msg = "Encryption features require 'cryptography'. Install with: pip install safe_store[encryption]" 45 | ASCIIColors.error(msg) 46 | raise ConfigurationError(msg) 47 | 48 | if password is None: 49 | self._fernet = None 50 | ASCIIColors.debug("Encryptor initialized without a password. Encryption/decryption disabled.") 51 | else: 52 | if not isinstance(password, str) or not password: 53 | raise ValueError("Encryption password must be a non-empty string.") 54 | # Note: Storing the key directly is generally discouraged in production. 55 | # This simple implementation derives the key on init. 56 | # A more robust system might derive it on demand or use a dedicated key management system. 57 | key = self._derive_key(password) 58 | self._fernet = Fernet(key) 59 | ASCIIColors.debug("Encryptor initialized with password-derived key.") 60 | 61 | @staticmethod 62 | def _derive_key(password: str, salt: Optional[bytes] = None) -> bytes: 63 | """ 64 | Derives a 32-byte key suitable for Fernet using PBKDF2HMAC-SHA256. 65 | 66 | A fixed salt is used here for simplicity, allowing the same password 67 | to always produce the same key. **WARNING:** In a real-world scenario, 68 | you'd typically generate a *unique* salt per encryption and store it 69 | alongside the ciphertext. However, for this use case (encrypting chunks 70 | all potentially decrypted with the same store instance/password), using a 71 | fixed derivative might be acceptable, though less ideal than per-chunk salts. 72 | Using a hardcoded salt makes it slightly less secure than generating one. 73 | Let's stick to a hardcoded one for simplicity of this library's scope, 74 | but document this limitation heavily. 
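        A minimal illustration of this trade-off (illustrative only, not part of the
        library's examples): with the fixed salt and fixed iteration count, key
        derivation is deterministic, so the same password always produces the same key.

            k1 = Encryptor._derive_key("my password")
            k2 = Encryptor._derive_key("my password")
            assert k1 == k2  # same password -> same Fernet key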
75 | 76 | Args: 77 | password: The user-provided password string. 78 | salt: Optional salt (not used in this fixed-salt implementation). 79 | 80 | Returns: 81 | A URL-safe base64-encoded 32-byte key. 82 | """ 83 | # --- !!! SECURITY WARNING !!! --- 84 | # Using a hardcoded salt is NOT best practice for general encryption. 85 | # It means the same password will always yield the same key, reducing 86 | # protection against rainbow table attacks compared to unique salts. 87 | # This is a simplification for this specific library context. 88 | # Consider generating and storing salts if higher security is needed. 89 | hardcoded_salt = b'safe_store_salt_' # 16 bytes 90 | 91 | if PBKDF2HMAC is None or hashes is None: 92 | # Should be caught by __init__ check, but defensive coding 93 | raise ConfigurationError("Cryptography library components missing for key derivation.") 94 | 95 | kdf = PBKDF2HMAC( 96 | algorithm=hashes.SHA256(), 97 | length=32, # Fernet keys are 32 bytes 98 | salt=hardcoded_salt, 99 | iterations=PBKDF2_ITERATIONS, 100 | ) 101 | key = base64.urlsafe_b64encode(kdf.derive(password.encode('utf-8'))) 102 | return key 103 | 104 | @property 105 | def is_enabled(self) -> bool: 106 | """Returns True if encryption is configured (password provided).""" 107 | return self._fernet is not None 108 | 109 | def encrypt(self, data: str) -> bytes: 110 | """ 111 | Encrypts string data. 112 | 113 | Args: 114 | data: The plaintext string to encrypt. 115 | 116 | Returns: 117 | The encrypted data as bytes (Fernet token). 118 | 119 | Raises: 120 | EncryptionError: If encryption is not enabled or fails. 121 | TypeError: If input data is not a string. 122 | """ 123 | if not self.is_enabled or self._fernet is None: 124 | raise EncryptionError("Encryption is not enabled (no password provided).") 125 | if not isinstance(data, str): 126 | raise TypeError("Data to encrypt must be a string.") 127 | 128 | try: 129 | encrypted_data = self._fernet.encrypt(data.encode('utf-8')) 130 | return encrypted_data 131 | except Exception as e: 132 | msg = f"Encryption failed: {e}" 133 | ASCIIColors.error(msg, exc_info=True) 134 | raise EncryptionError(msg) from e 135 | 136 | def decrypt(self, token: bytes) -> str: 137 | """ 138 | Decrypts a Fernet token back into a string. 139 | 140 | Args: 141 | token: The encrypted data (Fernet token) as bytes. 142 | 143 | Returns: 144 | The decrypted plaintext string. 145 | 146 | Raises: 147 | EncryptionError: If decryption is not enabled, the token is invalid 148 | (tampered or wrong key), or decryption fails. 149 | TypeError: If input token is not bytes. 150 | """ 151 | if not self.is_enabled or self._fernet is None: 152 | raise EncryptionError("Decryption is not enabled (no password provided).") 153 | if not isinstance(token, bytes): 154 | raise TypeError("Token to decrypt must be bytes.") 155 | 156 | try: 157 | decrypted_data = self._fernet.decrypt(token) 158 | return decrypted_data.decode('utf-8') 159 | except InvalidToken: 160 | msg = "Decryption failed: Invalid token (likely tampered or wrong key)." 
161 | ASCIIColors.error(msg) 162 | raise EncryptionError(msg) from InvalidToken # Chain specific error 163 | except Exception as e: 164 | msg = f"Decryption failed: {e}" 165 | ASCIIColors.error(msg, exc_info=True) 166 | raise EncryptionError(msg) from e -------------------------------------------------------------------------------- /safe_store/vectorization/methods/lollms/__init__.py: -------------------------------------------------------------------------------- 1 | # safe_store/vectorization/methods/lollms.py 2 | import numpy as np 3 | from typing import List, Optional, Dict, Any 4 | import os 5 | 6 | from ...base import BaseVectorizer 7 | from ....core.exceptions import ConfigurationError, VectorizationError 8 | from ascii_colors import ASCIIColors 9 | 10 | class_name = "LollmsVectorizer" 11 | 12 | import pipmaster as pm 13 | pm.ensure_packages(["openai"]) 14 | import openai 15 | 16 | _OpenAIAPIError = openai.APIError 17 | _OpenAIAuthenticationError = openai.AuthenticationError 18 | _OpenAINotFoundError = openai.NotFoundError 19 | _OpenAIRateLimitError = openai.RateLimitError 20 | _OpenAIBadRequestError = openai.BadRequestError 21 | _OpenAIAPIConnectionError = openai.APIConnectionError 22 | _OPENAI_AVAILABLE = True 23 | 24 | def list_available_models(**kwargs) -> List[str]: 25 | """ 26 | Dynamically lists models from a running Lollms (OpenAI-compatible) server. 27 | """ 28 | if not _OPENAI_AVAILABLE: 29 | raise ConfigurationError("Lollms support requires 'openai'. Please run: pip install safe_store[openai]") 30 | 31 | base_url = kwargs.get("base_url", "http://localhost:9600") 32 | api_key = kwargs.get("api_key", "not_needed") 33 | 34 | try: 35 | client = openai.Client(base_url=base_url, api_key=api_key) 36 | models = client.models.list() 37 | # The response is an object with a 'data' attribute which is a list of model objects 38 | return [model.id for model in models.data] 39 | except openai.APIConnectionError as e: 40 | raise VectorizationError(f"Could not connect to Lollms server at '{base_url}'. Please ensure it is running.") from e 41 | except Exception as e: 42 | raise VectorizationError(f"An unexpected error occurred while listing Lollms models: {e}") from e 43 | 44 | class LollmsVectorizer(BaseVectorizer): 45 | """ 46 | Vectorizes text using an OpenAI-compatible API, such as a local Lollms instance. 47 | 48 | Requires the `openai` Python library. The `model_config` dictionary specifies 49 | the model name and connection parameters. 50 | Example: 51 | `{"model": "nomic-embed-text", "base_url": "http://localhost:9600", "api_key": "..."}` 52 | 53 | Attributes: 54 | model_name (str): The name of the model to use for embeddings. 55 | api_key (Optional[str]): The API key for the service. 56 | base_url (Optional[str]): The base URL of the OpenAI-compatible API endpoint. 57 | client (openai.OpenAI): The OpenAI client instance. 58 | """ 59 | 60 | def __init__(self, model_config: Dict[str, Any], **kwargs): 61 | """ 62 | Initializes the LollmsVectorizer. 63 | 64 | Args: 65 | model_config: A dictionary with configuration. Must contain "model". 66 | Can also contain "api_key" and "base_url". 67 | 68 | Raises: 69 | ConfigurationError: If 'openai' is not installed or config is invalid. 70 | VectorizationError: If connection or model test fails. 71 | """ 72 | super().__init__( 73 | vectorizer_name="lollms" 74 | ) 75 | if not _OPENAI_AVAILABLE or openai is None: 76 | msg = "LollmsVectorizer requires the 'openai' library. 
Install with: pip install safe_store[openai]" 77 | raise ConfigurationError(msg) 78 | 79 | if not isinstance(model_config, dict) or "model" not in model_config: 80 | msg = "Lollms vectorizer config must be a dictionary with a 'model' key." 81 | raise ConfigurationError(msg) 82 | 83 | self.model_name: str = model_config["model"] 84 | self.api_key: Optional[str] = model_config.get("api_key", "not_needed") 85 | self.base_url: Optional[str] = model_config.get("base_url") 86 | 87 | ASCIIColors.info(f"Initializing Lollms (OpenAI-compatible) client. Model: {self.model_name}, Base URL: {self.base_url}") 88 | try: 89 | self.client: openai.OpenAI = openai.OpenAI(api_key=self.api_key, base_url=self.base_url) 90 | 91 | ASCIIColors.debug(f"Testing model '{self.model_name}' and retrieving dimension...") 92 | test_prompt = "hello world" 93 | response = self.client.embeddings.create( 94 | model=self.model_name, 95 | input=[test_prompt] 96 | ) 97 | 98 | if not response.data or not response.data[0].embedding: 99 | raise VectorizationError(f"Model '{self.model_name}' did not return a valid embedding. Response: {response}") 100 | 101 | embedding = response.data[0].embedding 102 | 103 | if not isinstance(embedding, list) or not embedding: 104 | raise VectorizationError(f"Model '{self.model_name}' returned an invalid embedding structure. Embedding: {embedding}") 105 | 106 | self._dim = len(embedding) 107 | if self._dim == 0: 108 | raise VectorizationError(f"Model '{self.model_name}' returned a zero-dimension embedding.") 109 | 110 | self._dtype = np.dtype(np.float32) 111 | ASCIIColors.info(f"Model '{self.model_name}' ready. Dimension: {self._dim}, Dtype: {self._dtype.name}") 112 | 113 | except (_OpenAIAuthenticationError, _OpenAINotFoundError, _OpenAIBadRequestError, _OpenAIRateLimitError, _OpenAIAPIConnectionError, _OpenAIAPIError) as e: 114 | msg = (f"API error for model '{self.model_name}': " 115 | f"HTTP {e.http_status if hasattr(e, 'http_status') else 'N/A'} - {e.code if hasattr(e, 'code') else 'N/A'} - {e.message}.") 116 | raise VectorizationError(msg) from e 117 | except Exception as e: 118 | msg = f"Failed to initialize Lollms vectorizer or test model '{self.model_name}': {e}" 119 | raise VectorizationError(msg) from e 120 | 121 | def vectorize(self, texts: List[str]) -> np.ndarray: 122 | """ 123 | Generates vector embeddings for a list of texts. 124 | 125 | Args: 126 | texts: A list of text strings to vectorize. 127 | 128 | Returns: 129 | A 2D NumPy array of embeddings. 130 | 131 | Raises: 132 | VectorizationError: If the embedding process fails. 
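        Example (illustrative sketch; assumes a reachable Lollms/OpenAI-compatible
        server and the configuration shown in the class docstring):

            vec = LollmsVectorizer({"model": "nomic-embed-text", "base_url": "http://localhost:9600"})
            arr = vec.vectorize(["hello", "world"])  # arr.shape == (len(texts), vec.dim)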
133 | """ 134 | if not texts: 135 | return np.empty((0, self.dim), dtype=self.dtype) 136 | 137 | ASCIIColors.debug(f"Vectorizing {len(texts)} texts using Lollms model '{self.model_name}'...") 138 | 139 | embeddings_results = [None] * len(texts) 140 | actual_texts_to_embed: List[str] = [] 141 | original_indices_for_api_texts: List[int] = [] 142 | 143 | for i, text in enumerate(texts): 144 | if not text.strip(): 145 | embeddings_results[i] = np.zeros(self.dim, dtype=self._dtype) 146 | else: 147 | actual_texts_to_embed.append(text) 148 | original_indices_for_api_texts.append(i) 149 | 150 | if actual_texts_to_embed: 151 | try: 152 | response = self.client.embeddings.create( 153 | model=self.model_name, 154 | input=actual_texts_to_embed 155 | ) 156 | 157 | if len(response.data) != len(actual_texts_to_embed): 158 | raise VectorizationError(f"API returned {len(response.data)} embeddings for {len(actual_texts_to_embed)} inputs.") 159 | 160 | for i, embedding_data in enumerate(response.data): 161 | original_idx = original_indices_for_api_texts[i] 162 | embedding_vector = embedding_data.embedding 163 | if not isinstance(embedding_vector, list) or len(embedding_vector) != self.dim: 164 | raise VectorizationError(f"Model '{self.model_name}' returned an invalid embedding for text at original index {original_idx}.") 165 | embeddings_results[original_idx] = embedding_vector 166 | 167 | except (_OpenAIBadRequestError, _OpenAIRateLimitError, _OpenAIAPIConnectionError, _OpenAIAPIError) as e: 168 | msg = f"Lollms API error during vectorization: {e.message if hasattr(e, 'message') else str(e)}" 169 | raise VectorizationError(msg) from e 170 | except Exception as e: 171 | msg = f"Unexpected error during Lollms vectorization: {e}" 172 | raise VectorizationError(msg) from e 173 | 174 | embeddings_array = np.array(embeddings_results, dtype=self._dtype) 175 | 176 | if embeddings_array.ndim != 2 or embeddings_array.shape[0] != len(texts) or embeddings_array.shape[1] != self.dim: 177 | raise VectorizationError(f"Vectorization resulted in unexpected shape {embeddings_array.shape}. Expected ({len(texts)}, {self.dim}).") 178 | 179 | ASCIIColors.debug(f"Lollms vectorization complete. 
Output shape: {embeddings_array.shape}") 180 | return embeddings_array 181 | 182 | @property 183 | def dim(self) -> int: 184 | return self._dim 185 | 186 | @property 187 | def dtype(self) -> np.dtype: 188 | return self._dtype 189 | 190 | @staticmethod 191 | def list_models(**kwargs) -> List[str]: 192 | """Listing models is dependent on the lollms binding and not exposed via client yet.""" 193 | return [] -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # tests/conftest.py 2 | import pytest 3 | from pathlib import Path 4 | import sqlite3 5 | import shutil 6 | import numpy as np 7 | from unittest.mock import MagicMock 8 | 9 | # Import the class for type hinting 10 | from safe_store import SafeStore, LogLevel 11 | 12 | # --- Fixture Directory --- 13 | FIXTURES_DIR = Path(__file__).parent / "fixtures" 14 | 15 | # --- Dependency Availability Check --- 16 | # Check for sentence-transformers 17 | try: 18 | from sentence_transformers import SentenceTransformer 19 | SENTENCE_TRANSFORMERS_AVAILABLE = True 20 | except ImportError: 21 | SENTENCE_TRANSFORMERS_AVAILABLE = False 22 | SentenceTransformer = None # Define as None if not available 23 | 24 | # Check for scikit-learn 25 | try: 26 | from sklearn.feature_extraction.text import TfidfVectorizer 27 | from sklearn.exceptions import NotFittedError 28 | SKLEARN_AVAILABLE = True 29 | except ImportError: 30 | SKLEARN_AVAILABLE = False 31 | TfidfVectorizer = None 32 | NotFittedError = None 33 | 34 | # --- Global Mocking Fixtures --- 35 | 36 | # Mock SentenceTransformer if not available 37 | if not SENTENCE_TRANSFORMERS_AVAILABLE: 38 | class MockSentenceTransformer: 39 | DEFAULT_MODEL = "mock-st-model" 40 | def __init__(self, model_name): 41 | self.model_name = model_name 42 | self._dim = 384 43 | self._dtype = np.float32 44 | def encode(self, texts, convert_to_numpy=True, show_progress_bar=False): 45 | if not texts: return np.empty((0, self._dim), dtype=self._dtype) 46 | return np.random.rand(len(texts), self._dim).astype(self._dtype) 47 | def get_sentence_embedding_dimension(self): return self._dim 48 | @property 49 | def dim(self): return self._dim 50 | @property 51 | def dtype(self): return self._dtype 52 | 53 | @pytest.fixture(scope="session", autouse=True) 54 | def mock_st_globally(session_mocker): 55 | # Use session_mocker if available, otherwise regular monkeypatch might work in module scope 56 | # Using monkeypatch fixture is generally preferred within test functions/fixtures 57 | # For autouse session scope, directly patching might be necessary if mocker isn't standard 58 | # Let's use monkeypatch fixture within other fixtures instead for safety. 
59 | pass # We will apply this mock conditionally in test files or fixtures needing it 60 | 61 | # Mock Scikit-learn if not available 62 | if not SKLEARN_AVAILABLE: 63 | class MockTfidfVectorizer: 64 | def __init__(self, **kwargs): 65 | self.params = kwargs; self._fitted = False; self.vocabulary_ = {}; self.idf_ = np.array([]) 66 | self.dtype = np.float64 67 | if 'dtype' in kwargs: 68 | try: self.dtype = np.dtype(kwargs['dtype']) 69 | except: pass 70 | class MockTfidfInternal: _idf_diag = MagicMock() 71 | self._tfidf = MockTfidfInternal(); self._dim = None 72 | def fit(self, texts): 73 | if not texts: self.vocabulary_ = {}; self.idf_ = np.array([], dtype=self.dtype); self._dim = 0 74 | else: 75 | words = set(w for t in texts for w in t.lower().split()) 76 | self.vocabulary_ = {w: i for i, w in enumerate(sorted(list(words)))} 77 | if not self.vocabulary_: self._dim = 0; self.idf_ = np.array([], dtype=self.dtype) 78 | else: self._dim = len(self.vocabulary_); self.idf_ = np.random.rand(self._dim).astype(self.dtype)*5+1 79 | self._fitted = True 80 | if hasattr(self, '_tfidf') and hasattr(self._tfidf, '_idf_diag'): self._tfidf._idf_diag.dtype = self.dtype 81 | return self 82 | def transform(self, texts): 83 | if not self._fitted: raise (NotFittedError or Exception)("MockTfidfVectorizer not fitted") 84 | if not texts: return MagicMock(**{'toarray.return_value': np.empty((0, self._dim or 0), dtype=self.dtype)}) 85 | num_samples=len(texts); vocab_size=self._dim if self._dim is not None else 0 86 | if vocab_size is None: vocab_size = 0 87 | dense_array = np.random.rand(num_samples, vocab_size).astype(self.dtype) 88 | return MagicMock(**{'toarray.return_value': dense_array, 'shape': dense_array.shape}) 89 | def get_params(self, deep=True): return self.params 90 | @property 91 | def dim(self): return self._dim 92 | 93 | @pytest.fixture(scope="session", autouse=True) 94 | def mock_sklearn_globally(session_mocker): 95 | # Similar caveat as mock_st_globally - apply conditionally where needed 96 | pass 97 | 98 | 99 | # --- Helper to conditionally apply mocks --- 100 | @pytest.fixture(autouse=True) 101 | def apply_mocks_conditionally(monkeypatch): 102 | """Applies mocks only if the libraries are unavailable.""" 103 | if not SENTENCE_TRANSFORMERS_AVAILABLE: 104 | monkeypatch.setattr("safe_store.vectorization.methods.sentence_transformer.SentenceTransformer", MockSentenceTransformer, raising=False) 105 | monkeypatch.setattr("safe_store.vectorization.methods.sentence_transformer._SENTENCE_TRANSFORMERS_AVAILABLE", True, raising=False) # Make wrapper think it's ok 106 | if not SKLEARN_AVAILABLE: 107 | monkeypatch.setattr("safe_store.vectorization.methods.tfidf.TfidfVectorizer", MockTfidfVectorizer, raising=False) 108 | monkeypatch.setattr("safe_store.vectorization.methods.tfidf.NotFittedError", NotFittedError or Exception, raising=False) 109 | monkeypatch.setattr("safe_store.vectorization.methods.tfidf._SKLEARN_AVAILABLE", True, raising=False) # Make wrapper think it's ok 110 | 111 | 112 | # --- Standard Fixtures --- 113 | @pytest.fixture(scope="function") 114 | def temp_db_path(tmp_path: Path) -> Path: 115 | """Provides a path to a temporary database file.""" 116 | return tmp_path / "test_safe_store.db" 117 | 118 | @pytest.fixture(scope="function") 119 | def safe_store_instance(temp_db_path: Path) -> SafeStore: 120 | """Provides a safe_store instance with a clean temporary database.""" 121 | # Ensure clean slate 122 | if temp_db_path.exists(): temp_db_path.unlink() 123 | lock_path = 
temp_db_path.with_suffix(".db.lock") 124 | if lock_path.exists(): lock_path.unlink() 125 | wal_path = temp_db_path.with_suffix(".db-wal") 126 | if wal_path.exists(): wal_path.unlink(missing_ok=True) 127 | shm_path = temp_db_path.with_suffix(".db-shm") 128 | if shm_path.exists(): shm_path.unlink(missing_ok=True) 129 | 130 | # Use DEBUG level for more verbose test output 131 | store = SafeStore(db_path=temp_db_path, log_level=LogLevel.DEBUG, lock_timeout=0.1) 132 | yield store 133 | store.close() # Ensure closure after test function finishes 134 | 135 | @pytest.fixture(scope="session") 136 | def sample_text_content() -> str: 137 | return "This is the first sentence.\nThis is the second sentence, it is a bit longer.\nAnd a third one." 138 | 139 | @pytest.fixture 140 | def sample_text_file(tmp_path: Path, sample_text_content: str) -> Path: 141 | """Creates a temporary text file.""" 142 | p = tmp_path / "sample.txt" 143 | p.write_text(sample_text_content, encoding='utf-8') 144 | return p 145 | 146 | # --- Phase 3 Fixtures --- 147 | @pytest.fixture 148 | def sample_pdf_file(tmp_path: Path) -> Path: 149 | """Copies the sample PDF to the temp directory.""" 150 | source = FIXTURES_DIR / "sample.pdf" 151 | if not source.exists(): pytest.skip("sample.pdf fixture file not found") 152 | dest = tmp_path / "sample.pdf" 153 | try: shutil.copy(source, dest) 154 | except Exception as e: pytest.fail(f"Failed to copy fixture file {source}: {e}") 155 | return dest 156 | 157 | @pytest.fixture 158 | def sample_docx_file(tmp_path: Path) -> Path: 159 | """Copies the sample DOCX to the temp directory.""" 160 | source = FIXTURES_DIR / "sample.docx" 161 | if not source.exists(): pytest.skip("sample.docx fixture file not found") 162 | dest = tmp_path / "sample.docx" 163 | try: shutil.copy(source, dest) 164 | except Exception as e: pytest.fail(f"Failed to copy fixture file {source}: {e}") 165 | return dest 166 | 167 | @pytest.fixture 168 | def sample_html_file(tmp_path: Path) -> Path: 169 | """Copies the sample HTML to the temp directory.""" 170 | source = FIXTURES_DIR / "sample.html" 171 | if not source.exists(): pytest.skip("sample.html fixture file not found") 172 | dest = tmp_path / "sample.html" 173 | try: shutil.copy(source, dest) 174 | except Exception as e: pytest.fail(f"Failed to copy fixture file {source}: {e}") 175 | return dest 176 | 177 | 178 | # --- Phase 2 Fixture --- 179 | @pytest.fixture 180 | def populated_store(safe_store_instance: SafeStore, sample_text_file: Path, tmp_path: Path) -> SafeStore: 181 | """Provides a safe_store instance with two documents added using the default ST vectorizer.""" 182 | store = safe_store_instance 183 | doc2_content = "Another document.\nWith different content for testing." 
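# A second, differently-worded document lets query tests rank chunks from more than one source file.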
184 | doc2_path = tmp_path / "sample2.txt" 185 | doc2_path.write_text(doc2_content, encoding='utf-8') 186 | 187 | # No need for availability check here due to global autouse fixture apply_mocks_conditionally 188 | try: 189 | with store: 190 | store.add_document(sample_text_file, chunk_size=30, chunk_overlap=5) 191 | store.add_document(doc2_path, chunk_size=25, chunk_overlap=5) 192 | except Exception as e: 193 | pytest.fail(f"Populated store fixture setup failed: {e}") 194 | 195 | return store -------------------------------------------------------------------------------- /tests/test_store_phase3.py: -------------------------------------------------------------------------------- 1 | # tests/test_store_phase3.py 2 | import pytest 3 | import sqlite3 4 | from pathlib import Path 5 | from unittest.mock import patch, MagicMock, call 6 | import re 7 | 8 | # Import exceptions and modules 9 | from safe_store import store as safe_store_store_module 10 | from safe_store import SafeStore, LogLevel 11 | from safe_store.core import db 12 | from safe_store.core.exceptions import ConcurrencyError, FileHandlingError, ConfigurationError, SafeStoreError 13 | 14 | # Import filelock components 15 | from filelock import Timeout, FileLock 16 | 17 | # --- FIX: Define availability check locally --- 18 | # Check for parsing libraries availability within this module 19 | try: 20 | import pypdf 21 | import docx 22 | import bs4 23 | import lxml 24 | PARSING_LIBS_AVAILABLE = True 25 | except ImportError: 26 | PARSING_LIBS_AVAILABLE = False 27 | 28 | # --- Test Constants --- 29 | PDF_TEXT = "This is PDF content." 30 | DOCX_TEXT = "This is DOCX content." 31 | HTML_TEXT = "This is HTML content." # Adjusted based on sample file 32 | 33 | # Helper for log checks 34 | def assert_log_call_containing(mock_logger, expected_substring): 35 | """Checks if any call to the mock logger contained the substring.""" 36 | found = False 37 | for call_args in mock_logger.call_args_list: 38 | args, kwargs = call_args 39 | if args and isinstance(args[0], str) and expected_substring in args[0]: 40 | found = True 41 | break 42 | if not found: 43 | for method_call in mock_logger.method_calls: 44 | call_name, args, kwargs = method_call 45 | if args and isinstance(args[0], str) and expected_substring in args[0]: 46 | found = True 47 | break 48 | assert found, f"Expected log call containing '{expected_substring}' not found in {mock_logger.call_args_list} or {mock_logger.method_calls}" 49 | 50 | 51 | # --- Parser Integration Tests --- 52 | 53 | # Use the locally defined PARSING_LIBS_AVAILABLE for skipif 54 | @pytest.mark.skipif(not PARSING_LIBS_AVAILABLE, reason="Requires parsing dependencies (pypdf, python-docx, beautifulsoup4, lxml)") 55 | @patch('safe_store.indexing.parser.ASCIIColors') 56 | @patch('safe_store.store.ASCIIColors') 57 | def test_add_document_pdf(mock_store_colors, mock_parser_colors, safe_store_instance: SafeStore, sample_pdf_file: Path): 58 | """Test adding a PDF document via safe_store.add_document.""" 59 | store = safe_store_instance 60 | with store: 61 | store.add_document(sample_pdf_file, chunk_size=50, chunk_overlap=10) 62 | 63 | assert_log_call_containing(mock_store_colors.info, f"Starting indexing process for: {sample_pdf_file.name}") 64 | assert_log_call_containing(mock_parser_colors.debug, "Dispatching parser for extension '.pdf'") 65 | assert_log_call_containing(mock_parser_colors.debug, f"Attempting to parse PDF file: {sample_pdf_file}") 66 | assert_log_call_containing(mock_store_colors.info, "Generated 1 chunks 
for") 67 | assert_log_call_containing(mock_store_colors.success, f"Successfully processed '{sample_pdf_file.name}'") 68 | mock_store_colors.error.assert_not_called() 69 | 70 | conn = sqlite3.connect(store.db_path) 71 | cursor = conn.cursor() 72 | cursor.execute("SELECT doc_id, full_text FROM documents WHERE file_path = ?", (str(sample_pdf_file.resolve()),)) 73 | doc_result = cursor.fetchone(); assert doc_result is not None 74 | doc_id = doc_result[0]; parsed_text = doc_result[1] 75 | assert len(parsed_text) > 5; assert "pdf" in parsed_text.lower() 76 | cursor.execute("SELECT COUNT(*) FROM chunks WHERE doc_id = ?", (doc_id,)) 77 | chunk_count = cursor.fetchone()[0]; assert chunk_count == 1 78 | cursor.execute("SELECT COUNT(v.vector_id) FROM vectors v JOIN chunks c ON v.chunk_id = c.chunk_id WHERE c.doc_id = ?", (doc_id,)) 79 | vector_count = cursor.fetchone()[0]; assert vector_count == 1 80 | conn.close() 81 | 82 | 83 | @pytest.mark.skipif(not PARSING_LIBS_AVAILABLE, reason="Requires parsing dependencies (pypdf, python-docx, beautifulsoup4, lxml)") 84 | @patch('safe_store.indexing.parser.ASCIIColors') 85 | @patch('safe_store.store.ASCIIColors') 86 | def test_add_document_docx(mock_store_colors, mock_parser_colors, safe_store_instance: SafeStore, sample_docx_file: Path): 87 | """Test adding a DOCX document via safe_store.add_document.""" 88 | store = safe_store_instance 89 | with store: 90 | store.add_document(sample_docx_file, chunk_size=50, chunk_overlap=10) 91 | 92 | assert_log_call_containing(mock_store_colors.info, f"Starting indexing process for: {sample_docx_file.name}") 93 | assert_log_call_containing(mock_parser_colors.debug, "Dispatching parser for extension '.docx'") 94 | assert_log_call_containing(mock_parser_colors.debug, f"Attempting to parse DOCX file: {sample_docx_file}") 95 | assert_log_call_containing(mock_store_colors.info, "Generated 1 chunks for") 96 | assert_log_call_containing(mock_store_colors.success, f"Successfully processed '{sample_docx_file.name}'") 97 | mock_store_colors.error.assert_not_called() 98 | 99 | conn = sqlite3.connect(store.db_path) 100 | cursor = conn.cursor() 101 | cursor.execute("SELECT doc_id, full_text FROM documents WHERE file_path = ?", (str(sample_docx_file.resolve()),)) 102 | doc_result = cursor.fetchone(); assert doc_result is not None 103 | doc_id = doc_result[0]; assert DOCX_TEXT == doc_result[1].strip() 104 | cursor.execute("SELECT COUNT(*) FROM chunks WHERE doc_id = ?", (doc_id,)) 105 | chunk_count = cursor.fetchone()[0]; assert chunk_count == 1 106 | cursor.execute("SELECT COUNT(v.vector_id) FROM vectors v JOIN chunks c ON v.chunk_id = c.chunk_id WHERE c.doc_id = ?", (doc_id,)) 107 | vector_count = cursor.fetchone()[0]; assert vector_count == 1 108 | conn.close() 109 | 110 | 111 | @pytest.mark.skipif(not PARSING_LIBS_AVAILABLE, reason="Requires parsing dependencies (pypdf, python-docx, beautifulsoup4, lxml)") 112 | @patch('safe_store.indexing.parser.ASCIIColors') 113 | @patch('safe_store.store.ASCIIColors') 114 | def test_add_document_html(mock_store_colors, mock_parser_colors, safe_store_instance: SafeStore, sample_html_file: Path): 115 | """Test adding an HTML document via safe_store.add_document.""" 116 | store = safe_store_instance 117 | with store: 118 | store.add_document(sample_html_file, chunk_size=50, chunk_overlap=10) 119 | 120 | assert_log_call_containing(mock_store_colors.info, f"Starting indexing process for: {sample_html_file.name}") 121 | assert_log_call_containing(mock_parser_colors.debug, "Dispatching parser for 
extension '.html'") 122 | assert_log_call_containing(mock_parser_colors.debug, f"Attempting to parse HTML file: {sample_html_file}") 123 | assert_log_call_containing(mock_store_colors.info, "Generated 1 chunks for") 124 | assert_log_call_containing(mock_store_colors.success, f"Successfully processed '{sample_html_file.name}'") 125 | mock_store_colors.error.assert_not_called() 126 | 127 | conn = sqlite3.connect(store.db_path) 128 | cursor = conn.cursor() 129 | cursor.execute("SELECT doc_id, full_text FROM documents WHERE file_path = ?", (str(sample_html_file.resolve()),)) 130 | doc_result = cursor.fetchone(); assert doc_result is not None 131 | doc_id = doc_result[0]; assert HTML_TEXT == doc_result[1].strip() 132 | cursor.execute("SELECT COUNT(*) FROM chunks WHERE doc_id = ?", (doc_id,)) 133 | chunk_count = cursor.fetchone()[0]; assert chunk_count == 1 134 | cursor.execute("SELECT COUNT(v.vector_id) FROM vectors v JOIN chunks c ON v.chunk_id = c.chunk_id WHERE c.doc_id = ?", (doc_id,)) 135 | vector_count = cursor.fetchone()[0]; assert vector_count == 1 136 | conn.close() 137 | 138 | # --- Concurrency Tests --- 139 | @patch('safe_store.store.ASCIIColors') 140 | def test_add_document_lock_acquired(mock_store_colors, safe_store_instance: SafeStore, sample_text_file: Path): 141 | """Test that add_document acquires and releases the lock.""" 142 | store = safe_store_instance 143 | with patch.object(store, '_file_lock', autospec=True) as mock_lock_instance: 144 | mock_lock_instance.__enter__.return_value = None 145 | mock_lock_instance.__exit__.return_value = None 146 | with store: 147 | store.add_document(sample_text_file) 148 | mock_lock_instance.__enter__.assert_called_once() 149 | mock_lock_instance.__exit__.assert_called_once() 150 | 151 | assert_log_call_containing(mock_store_colors.debug, "Attempting to acquire write lock for add_document") 152 | assert_log_call_containing(mock_store_colors.info, "Write lock acquired for add_document") 153 | assert_log_call_containing(mock_store_colors.debug, f"Write lock released for add_document: {sample_text_file.name}") 154 | assert_log_call_containing(mock_store_colors.success, f"Successfully processed '{sample_text_file.name}'") 155 | 156 | 157 | @patch('safe_store.store.ASCIIColors') 158 | def test_add_document_lock_timeout(mock_store_colors, safe_store_instance: SafeStore, sample_text_file: Path): 159 | """Test that add_document handles a lock timeout.""" 160 | store = safe_store_instance 161 | mock_lock_instance = MagicMock(spec=FileLock) 162 | timeout_exception = Timeout(store.lock_path) 163 | mock_lock_instance.__enter__.side_effect = timeout_exception 164 | 165 | with patch.object(store, '_file_lock', mock_lock_instance): 166 | expected_error_msg = f"Timeout ({store.lock_timeout}s) acquiring write lock for add_document: {sample_text_file.name}" 167 | with pytest.raises(ConcurrencyError, match=re.escape(expected_error_msg)): 168 | store.add_document(sample_text_file) 169 | mock_lock_instance.__enter__.assert_called_once() 170 | mock_lock_instance.__exit__.assert_not_called() 171 | 172 | assert_log_call_containing(mock_store_colors.debug, "Attempting to acquire write lock for add_document") 173 | assert_log_call_containing(mock_store_colors.error, expected_error_msg) 174 | mock_store_colors.success.assert_not_called() 175 | 176 | -------------------------------------------------------------------------------- /point_cloud_web_app/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 
| SafeStore | 2D Chunk Visualization 8 | (markup, styles, and the inline script of this page were stripped in this dump; only the visible page text survives) 24 | 2D Document Chunk Visualization 27 | An interactive PCA plot of vectorized document chunks. Each point represents a piece of text, clustered by semantic similarity. Powered by SafeStore. 41 | Chunk Inspector
-------------------------------------------------------------------------------- /safe_store/vectorization/methods/cohere/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import List, Optional, Dict, Any 3 | import os 4 | 5 | from ...base import BaseVectorizer 6 | from ....core.exceptions import ConfigurationError, VectorizationError 7 | from ascii_colors import ASCIIColors 8 | 9 | # each vectorizer must have a class name variable to be identified 10 | class_name = "CohereVectorizer" 11 | 12 | def list_available_models(**kwargs) -> List[str]: 13 | """ 14 | Returns a static list of common Cohere embedding models. 15 | """ 16 | return ["embed-english-v3.0", "embed-english-light-v3.0", "embed-multilingual-v3.0"] 17 | 18 | # Attempt import of cohere and related error types, handle gracefully 19 | try: 20 | import pipmaster as pm 21 | pm.ensure_packages(["cohere"]) # Ensure cohere is installed 22 | import cohere 23 | _CohereAPIError = cohere.APIError 24 | _CohereConnectionError = cohere.ConnectionError 25 | _COHERE_AVAILABLE = True 26 | except (ImportError, AttributeError): # error class names vary across cohere SDK versions 27 | _COHERE_AVAILABLE = False 28 | cohere = None 29 | class _CohereAPIError(Exception): pass 30 | class _CohereConnectionError(Exception): pass 31 | 32 | 33 | class CohereVectorizer(BaseVectorizer): 34 | """ 35 | Vectorizes text using models from Cohere via their API. 36 | 37 | Requires the `cohere` Python library and a Cohere API key. The key can be 38 | provided in the `model_config` dictionary or via the `COHERE_API_KEY` 39 | environment variable. 40 | 41 | Attributes: 42 | model_name (str): The name of the Cohere model to use. 43 | api_key (str): The Cohere API key being used. 44 | client (cohere.Client): The Cohere client instance. 45 | input_type (str): The input type for the embedding model. 46 | truncate (str): The truncation strategy for the model. 47 | """ 48 | DEFAULT_INPUT_TYPE = "search_document" 49 | DEFAULT_TRUNCATE = "END" 50 | 51 | def __init__(self, 52 | model_config: Dict[str, Any], 53 | **kwargs): 54 | """ 55 | Initializes the CohereVectorizer. 56 | 57 | Args: 58 | model_config: A dictionary containing the vectorizer's configuration. 59 | - "model" (str): Mandatory. The name of the model to use. 60 | - "api_key" (str): Optional. Your Cohere API key. If not 61 | provided, the COHERE_API_KEY environment variable is used. 62 | - "input_type" (str): Optional. E.g., "search_document". 63 | - "truncate" (str): Optional. E.g., "END". 64 | 65 | Raises: 66 | ConfigurationError: If 'cohere' is not installed, config is invalid, 67 | or the API key is missing. 68 | VectorizationError: If connection to Cohere fails or the model is invalid. 69 | """ 70 | super().__init__(vectorizer_name="cohere") 71 | if not _COHERE_AVAILABLE or cohere is None: 72 | raise ConfigurationError("CohereVectorizer requires the 'cohere' library. Install with: pip install safe_store[cohere]") 73 | 74 | if not isinstance(model_config, dict) or "model" not in model_config: 75 | raise ConfigurationError("Cohere vectorizer config must be a dictionary with a 'model' key.") 76 | 77 | self.model_name: str = model_config["model"] 78 | 79 | # API key discovery logic 80 | chosen_api_key: Optional[str] = model_config.get("api_key") 81 | if chosen_api_key: 82 | ASCIIColors.info("Using Cohere API key provided in vectorizer_config.") 83 | else: 84 | ASCIIColors.info("API key not in config. 
Checking COHERE_API_KEY environment variable.") 85 | chosen_api_key = os.environ.get("COHERE_API_KEY") 86 | 87 | if not chosen_api_key: 88 | raise ConfigurationError("Cohere API key not found. Provide it in the 'api_key' field of vectorizer_config or set the COHERE_API_KEY environment variable.") 89 | 90 | self.api_key: str = chosen_api_key 91 | 92 | # Get additional parameters from config 93 | self.input_type: str = model_config.get("input_type", self.DEFAULT_INPUT_TYPE) 94 | self.truncate: str = model_config.get("truncate", self.DEFAULT_TRUNCATE) 95 | 96 | # Parameter validation 97 | valid_input_types = ["search_document", "search_query", "classification", "clustering", "rerank"] 98 | if self.input_type not in valid_input_types: 99 | ASCIIColors.warning(f"Invalid input_type '{self.input_type}'. Defaulting to '{self.DEFAULT_INPUT_TYPE}'.") 100 | self.input_type = self.DEFAULT_INPUT_TYPE 101 | 102 | valid_truncate_types = ["NONE", "START", "END"] 103 | if self.truncate not in valid_truncate_types: 104 | ASCIIColors.warning(f"Invalid truncate type '{self.truncate}'. Defaulting to '{self.DEFAULT_TRUNCATE}'.") 105 | self.truncate = self.DEFAULT_TRUNCATE 106 | 107 | ASCIIColors.info(f"Initializing Cohere client. Model: {self.model_name}, Input Type: {self.input_type}") 108 | try: 109 | self.client: cohere.Client = cohere.Client(api_key=self.api_key) 110 | 111 | # Test connection and get embedding dimension 112 | test_prompt = "hello world" 113 | response = self.client.embed( 114 | texts=[test_prompt], 115 | model=self.model_name, 116 | input_type=self.input_type, 117 | truncate=self.truncate 118 | ) 119 | 120 | if not hasattr(response, 'embeddings') or not response.embeddings or not response.embeddings[0]: 121 | raise VectorizationError(f"Cohere model '{self.model_name}' did not return valid embeddings.") 122 | 123 | embedding = response.embeddings[0] 124 | 125 | self._dim = len(embedding) 126 | if self._dim == 0: 127 | raise VectorizationError(f"Cohere model '{self.model_name}' returned a zero-dimension embedding.") 128 | 129 | self._dtype = np.dtype(np.float32) 130 | ASCIIColors.info(f"Cohere model '{self.model_name}' ready. Dimension: {self._dim}") 131 | 132 | except _CohereAPIError as e: 133 | msg = f"Cohere API error for model '{self.model_name}': {e}. Check API key and model name." 
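# Auth, permission, and not-found responses (401/403/404) indicate a configuration problem (bad key or model name), so they are promoted to ConfigurationError below; other API failures remain VectorizationError.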
134 | if hasattr(e, 'http_status') and e.http_status in [401, 403, 404]: 135 | raise ConfigurationError(msg) from e 136 | raise VectorizationError(msg) from e 137 | except _CohereConnectionError as e: 138 | raise VectorizationError(f"Cohere connection error for model '{self.model_name}': {e}.") from e 139 | except Exception as e: 140 | raise VectorizationError(f"Failed to initialize Cohere vectorizer '{self.model_name}': {e}") from e 141 | 142 | def vectorize(self, texts: List[str]) -> np.ndarray: 143 | if not texts: 144 | return np.empty((0, self.dim), dtype=self.dtype) 145 | 146 | embeddings_results = [None] * len(texts) 147 | actual_texts_to_embed: List[str] = [] 148 | original_indices_for_api_texts: List[int] = [] 149 | 150 | for i, text in enumerate(texts): 151 | stripped_text = text.strip() 152 | if not stripped_text: 153 | embeddings_results[i] = np.zeros(self.dim, dtype=self._dtype) 154 | else: 155 | actual_texts_to_embed.append(stripped_text) 156 | original_indices_for_api_texts.append(i) 157 | 158 | if actual_texts_to_embed: 159 | try: 160 | if len(actual_texts_to_embed) > 96: 161 | ASCIIColors.warning(f"Attempting to vectorize {len(actual_texts_to_embed)} texts with Cohere, which may exceed batch limits.") 162 | 163 | response = self.client.embed( 164 | texts=actual_texts_to_embed, 165 | model=self.model_name, 166 | input_type=self.input_type, 167 | truncate=self.truncate 168 | ) 169 | 170 | if not hasattr(response, 'embeddings') or len(response.embeddings) != len(actual_texts_to_embed): 171 | raise VectorizationError("Cohere API returned a mismatched number of embeddings.") 172 | 173 | for i, embedding_vector in enumerate(response.embeddings): 174 | original_idx = original_indices_for_api_texts[i] 175 | if not isinstance(embedding_vector, list) or len(embedding_vector) != self.dim: 176 | raise VectorizationError(f"Cohere model returned an invalid embedding for text at index {original_idx}.") 177 | embeddings_results[original_idx] = embedding_vector 178 | 179 | except (_CohereAPIError, _CohereConnectionError) as e: 180 | raise VectorizationError(f"Cohere API error during vectorization: {e}") from e 181 | except Exception as e: 182 | raise VectorizationError(f"Unexpected error during Cohere vectorization: {e}") from e 183 | 184 | embeddings_array = np.array(embeddings_results, dtype=self._dtype) 185 | 186 | if embeddings_array.shape != (len(texts), self.dim): 187 | raise VectorizationError(f"Cohere vectorization resulted in unexpected shape {embeddings_array.shape}.") 188 | 189 | return embeddings_array 190 | 191 | @property 192 | def dim(self) -> int: 193 | return self._dim 194 | 195 | @property 196 | def dtype(self) -> np.dtype: 197 | return self._dtype 198 | 199 | @staticmethod 200 | def list_models(**kwargs) -> List[str]: 201 | """Lists available embedding models from Cohere.""" 202 | return [ 203 | "embed-english-v3.0", 204 | "embed-multilingual-v3.0", 205 | "embed-english-light-v3.0", 206 | "embed-multilingual-light-v3.0", 207 | "embed-english-v2.0", 208 | "embed-english-light-v2.0", 209 | "embed-multilingual-v2.0", 210 | ] -------------------------------------------------------------------------------- /examples/basic_usage.py: -------------------------------------------------------------------------------- 1 | # examples/basic_usage.py 2 | import safe_store 3 | from pathlib import Path 4 | import time 5 | import shutil 6 | 7 | # --- Configuration --- 8 | # Activate or deactivate examples for each vectorizer type. 
9 | # Each example will create its own separate database file. 10 | USE_ST = True # Sentence-Transformers (local model) 11 | USE_TFIDF = True # TF-IDF (local, data-dependent) 12 | USE_OLLAMA = True # Ollama (requires running Ollama server) 13 | USE_OPENAI = False # OpenAI (requires API key) 14 | USE_COHERE = False # Cohere (requires API key) 15 | USE_PARSING = True # Set to False if parsing libs not installed 16 | 17 | # --- Vectorizer Configurations --- 18 | # The new way: define vectorizer type and its config separately. 19 | st_config = {"model": "all-MiniLM-L6-v2"} 20 | tfidf_config = {"name": "my_tfidf"} # 'name' is just an identifier for this fitted model 21 | ollama_config = {"model": "qwen3-embedding:0.6b"} # Ensure you have pulled this model in Ollama 22 | openai_config = {"model": "text-embedding-3-small"} # Key from OPENAI_API_KEY env var 23 | cohere_config = {"model": "embed-english-v3.0"} # Key from COHERE_API_KEY env var 24 | 25 | # --- Helper Functions --- 26 | def print_header(title): 27 | print("\n" + "="*10 + f" {title} " + "="*10) 28 | 29 | def cleanup_db_files(db_file): 30 | """Cleans up only the database and its associated files.""" 31 | db_path = Path(db_file) 32 | paths_to_delete = [ 33 | db_path, 34 | Path(f"{db_path}.lock"), 35 | Path(f"{db_path}-wal"), 36 | Path(f"{db_path}-shm") 37 | ] 38 | for p in paths_to_delete: 39 | p.unlink(missing_ok=True) 40 | print(f"- Cleaned up database artifacts for {db_file}") 41 | 42 | # --- Document Preparation --- 43 | def prepare_documents(doc_dir="temp_docs_basic"): 44 | DOC_DIR = Path(doc_dir) 45 | # Clean up and recreate the directory from scratch at the beginning 46 | if DOC_DIR.exists(): 47 | shutil.rmtree(DOC_DIR) 48 | print_header("Preparing Sample Documents") 49 | DOC_DIR.mkdir(exist_ok=True) 50 | 51 | (DOC_DIR / "intro.txt").write_text( 52 | "safe_store is a Python library for local vector storage.", encoding='utf-8' 53 | ) 54 | (DOC_DIR / "update_later.txt").write_text( 55 | "Initial content for update testing.", encoding='utf-8' 56 | ) 57 | if USE_PARSING: 58 | (DOC_DIR / "web_snippet.html").write_text( 59 | "

Efficient retrieval is crucial for RAG pipelines.
", 60 | encoding='utf-8' 61 | ) 62 | print(f"- Documents created in: {DOC_DIR.resolve()}") 63 | 64 | # --- Main Script --- 65 | if __name__ == "__main__": 66 | # --- Discover and Print Available Vectorizers --- 67 | print_header("Discovering Available Vectorizers") 68 | available_vectorizers = safe_store.SafeStore.list_available_vectorizers() 69 | for vec in available_vectorizers: 70 | print(f"\n- Vectorizer: {vec['name']} ({vec.get('title', 'No Title')})") 71 | print(f" Description: {vec.get('description', 'N/A').strip()}") 72 | if vec.get('input_parameters'): 73 | print(" Parameters:") 74 | for param in vec['input_parameters']: 75 | default_val = f" (default: {param['default']})" if 'default' in param else "" 76 | mandatory_flag = "[MANDATORY]" if param.get('mandatory') else "[OPTIONAL]" 77 | print(f" - {param['name']}: {param.get('description', 'N/A')} {mandatory_flag}{default_val}") 78 | 79 | DOC_DIR = Path("temp_docs_basic") 80 | prepare_documents(DOC_DIR) 81 | 82 | # --- Example 1: Sentence Transformer (ST) --- 83 | if USE_ST: 84 | db_file_st = "st_store.db" 85 | print_header(f"Sentence Transformer Example (DB: {db_file_st})") 86 | cleanup_db_files(db_file_st) 87 | try: 88 | store_st = safe_store.SafeStore( 89 | db_path=db_file_st, 90 | vectorizer_name="st", 91 | vectorizer_config=st_config, 92 | log_level=safe_store.LogLevel.INFO 93 | ) 94 | with store_st: 95 | store_st.add_document(DOC_DIR / "intro.txt", metadata={"topic": "introduction"}) 96 | if USE_PARSING: 97 | store_st.add_document(DOC_DIR / "web_snippet.html", metadata={"source": "web"}) 98 | 99 | results_st = store_st.query("local database library", top_k=1) 100 | if results_st: 101 | res = results_st[0] 102 | print(f" Query Result: Score={res['similarity_percent']:.2f}%, Text='{res['chunk_text'][:60]}...'") 103 | 104 | # NEW: Demonstrate vectorizing with metadata 105 | print("\n Demonstrating vectorization with metadata...") 106 | store_st.add_text( 107 | unique_id="metadata_vectorization_test", 108 | text="This text is about oranges and lemons.", 109 | metadata={"topic": "citrus fruits", "author": "test"}, 110 | vectorize_with_metadata=True, # This is the new option 111 | force_reindex=True 112 | ) 113 | # This query should be more similar to the metadata ("citrus") than the other documents. 
114 | results_meta = store_st.query("information about citrus", top_k=1) 115 | if results_meta: 116 | res = results_meta[0] 117 | print(f" Querying with metadata context ('citrus'): Score={res['similarity_percent']:.2f}%, Path='{res['file_path']}'") 118 | if res['file_path'] == 'metadata_vectorization_test': 119 | print(" SUCCESS: The most relevant result came from the document with vectorized metadata.") 120 | else: 121 | print(" NOTE: The top result was not the one with vectorized metadata, which might happen with some models.") 122 | 123 | print("\n Demonstrating file update...") 124 | (DOC_DIR / "update_later.txt").write_text("This content is new and improved for re-indexing.") 125 | store_st.add_document(DOC_DIR / "update_later.txt", force_reindex=True) 126 | print(" 'update_later.txt' has been re-indexed.") 127 | 128 | except safe_store.ConfigurationError as e: 129 | print(f" [SKIP] Could not run ST example: {e}") 130 | except Exception as e: 131 | print(f" [ERROR] An unexpected error occurred: {e}") 132 | 133 | # --- Example 2: TF-IDF --- 134 | if USE_TFIDF: 135 | db_file_tfidf = "tfidf_store.db" 136 | print_header(f"TF-IDF Example (DB: {db_file_tfidf})") 137 | cleanup_db_files(db_file_tfidf) 138 | try: 139 | store_tfidf = safe_store.SafeStore( 140 | db_path=db_file_tfidf, 141 | vectorizer_name="tfidf", 142 | vectorizer_config=tfidf_config, 143 | chunking_strategy='character' 144 | ) 145 | with store_tfidf: 146 | print(" Adding documents (this will fit the TF-IDF model)...") 147 | store_tfidf.add_document(DOC_DIR / "intro.txt") 148 | if USE_PARSING: 149 | store_tfidf.add_document(DOC_DIR / "web_snippet.html") 150 | 151 | results_tfidf = store_tfidf.query("SQLite backend storage", top_k=1) 152 | if results_tfidf: 153 | res = results_tfidf[0] 154 | print(f" Query Result: Score={res['similarity_percent']:.2f}%, Text='{res['chunk_text'][:60]}...'") 155 | 156 | except safe_store.ConfigurationError as e: 157 | print(f" [SKIP] Could not run TF-IDF example: {e}") 158 | except Exception as e: 159 | print(f" [ERROR] An unexpected error occurred: {e}") 160 | 161 | # --- Example 3: Ollama --- 162 | if USE_OLLAMA: 163 | db_file_ollama = "ollama_store.db" 164 | print_header(f"Ollama Example with Custom Tokenizer (DB: {db_file_ollama})") 165 | cleanup_db_files(db_file_ollama) 166 | try: 167 | available_models = safe_store.SafeStore.list_models("ollama") 168 | print(f" Found Ollama models: {available_models}") 169 | if ollama_config["model"] not in available_models: 170 | print(f" [SKIP] Model '{ollama_config['model']}' not found in Ollama.") 171 | else: 172 | store_ollama = safe_store.SafeStore( 173 | db_path=db_file_ollama, 174 | vectorizer_name="ollama", 175 | vectorizer_config=ollama_config, 176 | # --- NEW: Use token-based chunking by providing a custom tokenizer --- 177 | chunking_strategy='token', 178 | custom_tokenizer={"name": "tiktoken", "model": "cl100k_base"} 179 | ) 180 | with store_ollama: 181 | store_ollama.add_document(DOC_DIR / "intro.txt") 182 | results_ollama = store_ollama.query("file-based vector db", top_k=1) 183 | if results_ollama: 184 | res = results_ollama[0] 185 | print(f" Query Result: Score={res['similarity_percent']:.2f}%, Text='{res['chunk_text'][:60]}...'") 186 | 187 | except safe_store.VectorizationError as e: 188 | print(f" [SKIP] Could not connect to Ollama server: {e}") 189 | except Exception as e: 190 | print(f" [ERROR] An unexpected error occurred: {e}") 191 | 192 | 193 | # --- API-based examples --- 194 | if USE_OPENAI: 195 | 
db_file_openai = "openai_store.db" 196 | print_header(f"OpenAI Example (DB: {db_file_openai})") 197 | cleanup_db_files(db_file_openai) 198 | try: 199 | store_openai = safe_store.SafeStore( 200 | db_path=db_file_openai, 201 | vectorizer_name="openai", 202 | vectorizer_config=openai_config, 203 | chunking_strategy='character' # Also required for OpenAI 204 | ) 205 | with store_openai: 206 | store_openai.add_document(DOC_DIR / "intro.txt") 207 | results_openai = store_openai.query("python tool for embeddings", top_k=1) 208 | if results_openai: 209 | print(f" Query Result: Score={results_openai[0]['similarity_percent']:.2f}%") 210 | except Exception as e: 211 | print(f" [ERROR] OpenAI example failed: {e}") 212 | 213 | if USE_COHERE: 214 | db_file_cohere = "cohere_store.db" 215 | print_header(f"Cohere Example (DB: {db_file_cohere})") 216 | cleanup_db_files(db_file_cohere) 217 | try: 218 | store_cohere = safe_store.SafeStore( 219 | db_path=db_file_cohere, 220 | vectorizer_name="cohere", 221 | vectorizer_config=cohere_config, 222 | chunking_strategy='character' # Also required for Cohere 223 | ) 224 | with store_cohere: 225 | store_cohere.add_document(DOC_DIR / "intro.txt") 226 | results_cohere = store_cohere.query("library for vector search", top_k=1) 227 | if results_cohere: 228 | print(f" Query Result: Score={results_cohere[0]['similarity_percent']:.2f}%") 229 | except Exception as e: 230 | print(f" [ERROR] Cohere example failed: {e}") 231 | 232 | print("\n--- Final Cleanup ---") 233 | if DOC_DIR.exists(): 234 | shutil.rmtree(DOC_DIR) 235 | print(f"- Removed directory: {DOC_DIR}") 236 | 237 | print("\n--- End of Script ---") -------------------------------------------------------------------------------- /scripts/migration_v1_v2.py: -------------------------------------------------------------------------------- 1 | # migrate_v1_to_v2_argparse.py 2 | import sqlite3 3 | from pathlib import Path 4 | from typing import Union, Optional, Any 5 | import argparse 6 | from ascii_colors import ASCIIColors 7 | 8 | # --- DatabaseError and connect_db remain the same --- 9 | class DatabaseError(Exception): 10 | pass 11 | 12 | def connect_db(db_path: Union[str, Path]) -> sqlite3.Connection: 13 | db_path_obj = Path(db_path).resolve() 14 | try: 15 | db_path_obj.parent.mkdir(parents=True, exist_ok=True) 16 | conn = sqlite3.connect( 17 | str(db_path_obj), 18 | detect_types=sqlite3.PARSE_DECLTYPES, 19 | check_same_thread=False 20 | ) 21 | conn.execute("PRAGMA journal_mode=WAL;") 22 | conn.execute("PRAGMA foreign_keys = ON;") 23 | ASCIIColors.debug(f"Connected to database: {db_path_obj} (WAL enabled)") 24 | return conn 25 | except sqlite3.Error as e: 26 | msg = f"Database connection error to {db_path_obj}: {e}" 27 | ASCIIColors.error(msg, exc_info=True) 28 | raise DatabaseError(msg) from e 29 | 30 | # --- set_store_metadata and get_store_metadata remain the same --- 31 | def set_store_metadata(conn: sqlite3.Connection, key: str, value: str) -> None: 32 | sql = "INSERT OR REPLACE INTO store_metadata (key, value) VALUES (?, ?)" 33 | cursor = conn.cursor() 34 | try: 35 | cursor.execute(sql, (key, value)) 36 | ASCIIColors.debug(f"Set store_metadata: {key} = {value}") 37 | except sqlite3.Error as e: 38 | msg = f"Error setting store metadata '{key}': {e}" 39 | ASCIIColors.error(msg, exc_info=True) 40 | raise DatabaseError(msg) from e 41 | 42 | def get_store_metadata(conn: sqlite3.Connection, key: str) -> Optional[str]: 43 | cursor = conn.cursor() 44 | try: 45 | cursor.execute("SELECT 1 FROM sqlite_master WHERE 
type='table' AND name='store_metadata';") 46 | if not cursor.fetchone(): 47 | return None 48 | 49 | sql = "SELECT value FROM store_metadata WHERE key = ?" 50 | cursor.execute(sql, (key,)) 51 | result = cursor.fetchone() 52 | return result[0] if result else None 53 | except sqlite3.Error as e: 54 | ASCIIColors.warning(f"Could not get store metadata for key '{key}' (may not exist yet): {e}") 55 | return None 56 | 57 | def table_exists(cursor: sqlite3.Cursor, table_name: str) -> bool: 58 | """Checks if a table exists in the database.""" 59 | cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?;", (table_name,)) 60 | return cursor.fetchone() is not None 61 | 62 | def migrate_v1_to_v2(db_path: Path, auto_yes: bool = False): 63 | """ 64 | Migrates the SafeStore database from v1.0 schema to v2.0 schema. 65 | Adds graph-related tables and columns. 66 | 67 | Args: 68 | db_path: Path object to the database file. 69 | auto_yes: If True, skips interactive prompts. 70 | """ 71 | ASCIIColors.info(f"Attempting migration for database: {db_path}") 72 | 73 | if not db_path.exists(): 74 | ASCIIColors.error(f"Database file {db_path} does not exist. Cannot migrate.") 75 | ASCIIColors.info("If this is a new setup, the main application will initialize it to v2.0.") 76 | return False 77 | 78 | if not auto_yes: 79 | ASCIIColors.warning("IMPORTANT: Please backup your database file before proceeding!") 80 | try: 81 | if not Path("/dev/tty").is_char_device(): 82 | ASCIIColors.info("Non-interactive environment detected, proceeding without prompt.") 83 | elif input("Press Enter to continue or Ctrl+C to abort..."): 84 | ASCIIColors.info("Migration aborted by user input.") 85 | return False 86 | except (EOFError, KeyboardInterrupt): 87 | ASCIIColors.info("Migration aborted.") 88 | return False 89 | except Exception: 90 | ASCIIColors.info("Could not get interactive input, proceeding with caution. Use --yes to bypass.") 91 | 92 | conn = None 93 | try: 94 | conn = connect_db(db_path) 95 | cursor = conn.cursor() 96 | 97 | # --- Pre-migration V1 Schema Check --- 98 | ASCIIColors.info("Performing pre-migration schema check...") 99 | required_v1_tables = ["documents", "vectorization_methods", "chunks", "vectors"] 100 | missing_v1_tables = [] 101 | for table_name in required_v1_tables: 102 | if not table_exists(cursor, table_name): 103 | missing_v1_tables.append(table_name) 104 | 105 | if missing_v1_tables: 106 | ASCIIColors.error(f"The database at '{db_path}' is missing essential v1.0 tables: {', '.join(missing_v1_tables)}.") 107 | ASCIIColors.error("This script expects a database with a valid v1.0 schema.") 108 | ASCIIColors.info("If this is an empty database, your application should initialize it directly to v2.0.") 109 | return False 110 | ASCIIColors.green("Basic v1.0 schema tables found.") 111 | 112 | 113 | # --- Version Check (after confirming basic tables exist) --- 114 | current_version = get_store_metadata(conn, 'schema_version') 115 | if current_version == '2.0': 116 | ASCIIColors.success(f"Database '{db_path}' is already at schema version 2.0. No migration needed.") 117 | return True 118 | elif current_version: 119 | ASCIIColors.warning(f"Database '{db_path}' has an existing schema version: '{current_version}'.") 120 | ASCIIColors.warning("This script is designed for v1.0 (no version marker) to v2.0 migration.") 121 | if not auto_yes: 122 | if input(f"Continue migration from '{current_version}' to '2.0'? 
(yes/NO): ").lower() != 'yes': 123 | ASCIIColors.info("Migration aborted by user.") 124 | return False 125 | else: 126 | ASCIIColors.info(f"Auto-proceeding with migration from '{current_version}' to '2.0'.") 127 | else: 128 | ASCIIColors.info("No schema_version metadata found. Assuming v1.0 database.") 129 | 130 | 131 | ASCIIColors.info("Proceeding with v1.0 to v2.0 migration tasks...") 132 | 133 | cursor.execute("PRAGMA foreign_keys=OFF;") 134 | 135 | # 1. Add 'graph_processed_at' column and index to 'chunks' table 136 | ASCIIColors.info("Updating 'chunks' table (guaranteed to exist by pre-check)...") 137 | cursor.execute("PRAGMA table_info(chunks);") 138 | columns_in_chunks = [info[1] for info in cursor.fetchall()] 139 | if 'graph_processed_at' not in columns_in_chunks: 140 | cursor.execute("ALTER TABLE chunks ADD COLUMN graph_processed_at DATETIME;") 141 | ASCIIColors.info("Added 'graph_processed_at' column to 'chunks'.") 142 | cursor.execute("CREATE INDEX IF NOT EXISTS idx_chunk_graph_processed_at ON chunks (graph_processed_at);") 143 | ASCIIColors.green("'chunks' table updated and indexed.") 144 | 145 | # 2. Create 'store_metadata' table 146 | ASCIIColors.info("Ensuring 'store_metadata' table exists...") 147 | cursor.execute(""" 148 | CREATE TABLE IF NOT EXISTS store_metadata (key TEXT PRIMARY KEY, value TEXT); 149 | """) 150 | ASCIIColors.green("'store_metadata' table ensured.") 151 | 152 | # 3. Create 'graph_nodes' table and indexes 153 | ASCIIColors.info("Ensuring 'graph_nodes' table and indexes...") 154 | cursor.execute(""" 155 | CREATE TABLE IF NOT EXISTS graph_nodes ( 156 | node_id INTEGER PRIMARY KEY AUTOINCREMENT, node_label TEXT NOT NULL, 157 | node_properties TEXT, unique_signature TEXT UNIQUE); 158 | """) 159 | cursor.execute("CREATE INDEX IF NOT EXISTS idx_graph_node_label ON graph_nodes (node_label);") 160 | cursor.execute("CREATE INDEX IF NOT EXISTS idx_graph_node_signature ON graph_nodes (unique_signature);") 161 | ASCIIColors.green("'graph_nodes' table and indexes ensured.") 162 | 163 | # 4. Create 'graph_relationships' table and indexes 164 | ASCIIColors.info("Ensuring 'graph_relationships' table and indexes...") 165 | cursor.execute(""" 166 | CREATE TABLE IF NOT EXISTS graph_relationships ( 167 | relationship_id INTEGER PRIMARY KEY AUTOINCREMENT, source_node_id INTEGER NOT NULL, 168 | target_node_id INTEGER NOT NULL, relationship_type TEXT NOT NULL, 169 | relationship_properties TEXT, 170 | FOREIGN KEY (source_node_id) REFERENCES graph_nodes (node_id) ON DELETE CASCADE, 171 | FOREIGN KEY (target_node_id) REFERENCES graph_nodes (node_id) ON DELETE CASCADE); 172 | """) 173 | cursor.execute("CREATE INDEX IF NOT EXISTS idx_graph_rel_source_type ON graph_relationships (source_node_id, relationship_type);") 174 | cursor.execute("CREATE INDEX IF NOT EXISTS idx_graph_rel_target_type ON graph_relationships (target_node_id, relationship_type);") 175 | cursor.execute("CREATE INDEX IF NOT EXISTS idx_graph_rel_type ON graph_relationships (relationship_type);") 176 | ASCIIColors.green("'graph_relationships' table and indexes ensured.") 177 | 178 | # 5. 
Create 'node_chunk_links' table and indexes 179 | ASCIIColors.info("Ensuring 'node_chunk_links' table and indexes...") 180 | cursor.execute(""" 181 | CREATE TABLE IF NOT EXISTS node_chunk_links ( 182 | node_id INTEGER NOT NULL, chunk_id INTEGER NOT NULL, 183 | FOREIGN KEY (node_id) REFERENCES graph_nodes (node_id) ON DELETE CASCADE, 184 | FOREIGN KEY (chunk_id) REFERENCES chunks (chunk_id) ON DELETE CASCADE, 185 | PRIMARY KEY (node_id, chunk_id)); 186 | """) 187 | cursor.execute("CREATE INDEX IF NOT EXISTS idx_ncl_node_id ON node_chunk_links (node_id);") 188 | cursor.execute("CREATE INDEX IF NOT EXISTS idx_ncl_chunk_id ON node_chunk_links (chunk_id);") 189 | ASCIIColors.green("'node_chunk_links' table and indexes ensured.") 190 | 191 | # 6. Update schema version in store_metadata 192 | ASCIIColors.info("Updating schema version to 2.0 in 'store_metadata'.") 193 | cursor.execute("INSERT OR REPLACE INTO store_metadata (key, value) VALUES (?, ?)", ('schema_version', '2.0')) 194 | 195 | cursor.execute("PRAGMA foreign_keys=ON;") 196 | 197 | conn.commit() 198 | ASCIIColors.success(f"Database migration to v2.0 completed successfully for: {db_path}") 199 | return True 200 | 201 | except sqlite3.Error as e: 202 | ASCIIColors.error(f"SQLite error during migration: {e}") 203 | if conn: 204 | ASCIIColors.warning("Rolling back changes due to error.") 205 | conn.rollback() 206 | return False 207 | except DatabaseError as e: 208 | ASCIIColors.error(f"Database operation error during migration: {e}") 209 | if conn: 210 | ASCIIColors.warning("Rolling back changes due to error.") 211 | conn.rollback() 212 | return False 213 | except Exception as e: 214 | ASCIIColors.error(f"An unexpected error occurred during migration: {e}", exc_info=True) 215 | if conn: 216 | ASCIIColors.warning("Rolling back changes due to error.") 217 | conn.rollback() 218 | return False 219 | finally: 220 | if conn: 221 | conn.close() 222 | ASCIIColors.debug("Database connection closed.") 223 | 224 | # --- main() function with argparse remains the same --- 225 | def main(): 226 | parser = argparse.ArgumentParser( 227 | description="Migrate SafeStore SQLite database from v1.0 schema to v2.0 schema.", 228 | formatter_class=argparse.RawTextHelpFormatter, 229 | epilog=""" 230 | Example usage: 231 | python %(prog)s /path/to/your/safestore.db 232 | python %(prog)s my_database.sqlite --yes 233 | 234 | This script adds new tables and columns for graph database functionality. 235 | It is designed to be run on a database created with a pre-graph version of SafeStore. 236 | Ensure you have a backup of your database before running this script. 237 | """ 238 | ) 239 | parser.add_argument( 240 | "db_path", 241 | type=Path, 242 | help="Path to the SQLite database file to migrate." 243 | ) 244 | parser.add_argument( 245 | "--yes", 246 | "-y", 247 | action="store_true", 248 | help="Automatically answer 'yes' to confirmation prompts (use with caution)." 249 | ) 250 | 251 | args = parser.parse_args() 252 | 253 | if migrate_v1_to_v2(args.db_path, auto_yes=args.yes): 254 | ASCIIColors.highlight("Migration process finished.") 255 | else: 256 | ASCIIColors.critical("Migration process failed or was aborted. 
Please check the logs.") 257 | exit(1) 258 | 259 | if __name__ == "__main__": 260 | main() 261 | -------------------------------------------------------------------------------- /examples/graph_usage.py: -------------------------------------------------------------------------------- 1 | # [FINAL & ROBUST] examples/graph_usage.py 2 | import safe_store 3 | from safe_store import GraphStore, LogLevel, SafeStore 4 | import pipmaster as pm 5 | 6 | pm.ensure_packages(["lollms_client"]) 7 | from lollms_client import LollmsClient 8 | from ascii_colors import ASCIIColors, trace_exception 9 | import sqlite3 10 | from pathlib import Path 11 | import json 12 | import shutil 13 | from typing import Dict, List, Any, Optional 14 | 15 | # --- Configuration --- 16 | DB_FILE = "graph_example_store.db" 17 | DOC_DIR = Path("temp_docs_graph_example") 18 | 19 | # --- LOLLMS Client Configuration --- 20 | BINDING_NAME = "ollama" 21 | HOST_ADDRESS = "http://localhost:11434" 22 | MODEL_NAME = "mistral:latest" 23 | 24 | # --- Ontology Definitions --- 25 | DETAILED_ONTOLOGY = { 26 | "nodes": { 27 | "Person": {"description": "A human individual.", "properties": {"name": "string", "title": "string"}}, 28 | "Company": {"description": "A commercial business.", "properties": {"name": "string", "location": "string"}}, 29 | "Product": {"description": "A product created by a company.", "properties": {"name": "string"}}, 30 | "ResearchPaper": {"description": "An academic publication.", "properties": {"title": "string"}}, 31 | "University": {"description": "An institution of higher education.", "properties": {"name": "string"}} 32 | }, 33 | "relationships": { 34 | "WORKS_AT": {"description": "Person is employed by Company.", "source": "Person", "target": "Company"}, 35 | "CEO_OF": {"description": "Person is the CEO of Company.", "source": "Person", "target": "Company"}, 36 | "FOUNDED": {"description": "Person founded a Company.", "source": "Person", "target": "Company"}, 37 | "COMPETITOR_OF": {"description": "Company is a competitor of another Company.", "source": "Company", "target": "Company"}, 38 | "PRODUCES": {"description": "Company creates a Product.", "source": "Company", "target": "Product"}, 39 | "AUTHOR_OF": {"description": "Person wrote a ResearchPaper.", "source": "Person", "target": "ResearchPaper"}, 40 | "AFFILIATED_WITH": {"description": "Person is associated with a University.", "source": "Person", "target": "University"} 41 | } 42 | } 43 | SIMPLE_ONTOLOGY = { 44 | "nodes": {"Entity": {"description": "A person, company, or organization.", "properties": {"name": "string"}}}, 45 | "relationships": {"IS_RELATED_TO": {"description": "Indicates a general connection between two entities.", "source": "Entity", "target": "Entity"}} 46 | } 47 | 48 | # NEW: Ontology as a simple string of instructions 49 | STRING_ONTOLOGY = """ 50 | - Extract People, Companies, and Products as nodes. 51 | - For 'People' nodes, extract their full name and any job title mentioned as properties. 52 | - For 'Companies' nodes, extract their full name and location as properties. 53 | - For 'Products' nodes, extract their name. 54 | - Create relationships like WORKS_AT, CEO_OF, and PRODUCES between these nodes. 
55 | """ 56 | 57 | 58 | LC_CLIENT: Optional[LollmsClient] = None 59 | 60 | def initialize_lollms_client() -> bool: 61 | global LC_CLIENT 62 | if LC_CLIENT is None: 63 | ASCIIColors.info(f"Initializing LollmsClient: Binding='{BINDING_NAME}', Host='{HOST_ADDRESS}', Model='{MODEL_NAME}'") 64 | try: 65 | LC_CLIENT = LollmsClient(llm_binding_name=BINDING_NAME, llm_binding_config={"host_address": HOST_ADDRESS, "model_name": MODEL_NAME}) 66 | if not LC_CLIENT.llm: 67 | ASCIIColors.error(f"LollmsClient binding '{BINDING_NAME}' is not ready."); LC_CLIENT = None; return False 68 | ASCIIColors.success("LollmsClient initialized and ready.") 69 | return True 70 | except Exception as e: 71 | ASCIIColors.error(f"Failed to initialize LollmsClient: {e}"); trace_exception(e); LC_CLIENT = None; return False 72 | return True 73 | 74 | def llm_executor_callback(full_prompt: str) -> str: 75 | global LC_CLIENT 76 | if LC_CLIENT is None: raise ConnectionError("LollmsClient not initialized.") 77 | try: 78 | return LC_CLIENT.generate_code(full_prompt, language="json", temperature=0.05, top_k=10) 79 | except Exception as e: 80 | raise RuntimeError(f"LLM execution for JSON failed: {e}") from e 81 | 82 | def generate_answer_from_context(question: str, graph_data: Dict, chunks_data: Optional[List[Dict]] = None) -> str: 83 | global LC_CLIENT 84 | if LC_CLIENT is None: return "LLM not available." 85 | context_lines = ["--- CONTEXT ---"] 86 | if graph_data and graph_data.get("nodes"): 87 | context_lines.append("\n[Graph Information]:") 88 | node_map = {n['node_id']: n for n in graph_data['nodes']} 89 | 90 | def get_node_instance_name(node_id: int) -> str: 91 | """Helper to get the best possible name for a node instance.""" 92 | node = node_map.get(node_id) 93 | if not node: 94 | return f"ID:{node_id}" 95 | props = node.get('properties', {}) 96 | # Prioritize 'identifying_value', then 'name', then 'title' before falling back to ID. 97 | return props.get('identifying_value') or props.get('name') or props.get('title') or f"ID:{node_id}" 98 | 99 | for node in graph_data['nodes']: 100 | instance_name = get_node_instance_name(node['node_id']) 101 | context_lines.append(f"- Instance '{instance_name}' (type: {node['label']}): {json.dumps(node.get('properties', {}))}") 102 | 103 | for rel in graph_data.get('relationships', []): 104 | src_name = get_node_instance_name(rel['source_node_id']) 105 | tgt_name = get_node_instance_name(rel['target_node_id']) 106 | context_lines.append(f"- Relationship: '{src_name}' --[{rel['type']}]--> '{tgt_name}'") 107 | 108 | if chunks_data: 109 | context_lines.append("\n[Relevant Text Snippets]:") 110 | for i, chunk in enumerate(chunks_data): 111 | context_lines.append(f"- Snippet {i+1}: \"{chunk['chunk_text']}\"") 112 | context_lines.append("\n--- END OF CONTEXT ---") 113 | context_str = "\n".join(context_lines) 114 | 115 | prompt = (f"Answer the user's question based ONLY on the provided context. Do not use prior knowledge.\n\n" 116 | f"{context_str}\n\nQuestion: {question}") 117 | 118 | ASCIIColors.magenta("--- Sending Synthesis Prompt to LLM ---") 119 | try: 120 | return LC_CLIENT.generate_text(prompt, n_predict=512) 121 | except Exception as e: 122 | ASCIIColors.error(f"Error during answer synthesis: {e}") 123 | return "Error generating the answer." 
124 | 125 | def print_header(title: str): 126 | print("\n" + "="*25 + f" {title} " + "="*25) 127 | 128 | def cleanup(): 129 | print_header("Cleaning Up Previous Run") 130 | paths = [Path(DB_FILE), Path(f"{DB_FILE}.lock"), Path(f"{DB_FILE}-wal"), Path(f"{DB_FILE}-shm"), DOC_DIR] 131 | for p in paths: 132 | try: 133 | if p.is_file(): p.unlink(missing_ok=True); print(f"- Removed file: {p}") 134 | elif p.is_dir(): shutil.rmtree(p, ignore_errors=True); print(f"- Removed directory: {p}") 135 | except OSError as e: print(f"- Warning: Could not remove {p}: {e}") 136 | 137 | def clear_graph_data(conn: sqlite3.Connection): 138 | ASCIIColors.warning("\nClearing all existing graph data from the database...") 139 | try: 140 | conn.execute("BEGIN") 141 | conn.execute("DELETE FROM node_chunk_links;") 142 | conn.execute("DELETE FROM graph_relationships;") 143 | conn.execute("DELETE FROM graph_nodes;") 144 | conn.execute("UPDATE chunks SET graph_processed_at = NULL;") 145 | conn.commit() 146 | ASCIIColors.success("Graph data cleared.") 147 | except sqlite3.Error as e: 148 | conn.rollback() 149 | ASCIIColors.error(f"Failed to clear graph data: {e}") 150 | 151 | if __name__ == "__main__": 152 | cleanup() 153 | if not initialize_lollms_client(): 154 | ASCIIColors.error("Exiting: LollmsClient initialization failure."); exit(1) 155 | 156 | ASCIIColors.set_log_level(LogLevel.INFO) 157 | 158 | try: 159 | print_header("Preparing Documents (One-time setup)") 160 | DOC_DIR.mkdir(exist_ok=True, parents=True) 161 | doc1_content = "Acme Innovations, led by CEO Dr. Evelyn Reed, is a tech company based in Silicon Valley. Their flagship product, 'NovaCore', was launched in 2023. John Doe works as a Senior Engineer at Acme Innovations and reports to Dr. Reed. Acme Innovations is a competitor of Beta Solutions." 162 | (DOC_DIR / "company_info.txt").write_text(doc1_content.strip(), encoding='utf-8') 163 | doc2_content = "The research paper 'Quantum Entanglement in Nanostructures' by Dr. Alice Smith cites work by Dr. Evelyn Reed on early quantum theories. Dr. Reed is also known for her work at Acme Innovations." 164 | (DOC_DIR / "research_paper_snippet.txt").write_text(doc2_content.strip(), encoding='utf-8') 165 | 166 | with SafeStore(db_path=DB_FILE) as store: 167 | store.add_document(DOC_DIR / "company_info.txt") 168 | store.add_document(DOC_DIR / "research_paper_snippet.txt") 169 | 170 | print_header("PASS 1: Building Graph with DETAILED Ontology") 171 | graph_store_detailed = GraphStore(store=store, llm_executor_callback=llm_executor_callback, ontology=DETAILED_ONTOLOGY) 172 | graph_store_detailed.build_graph_for_all_documents() 173 | ASCIIColors.success("Graph building with detailed ontology complete.") 174 | 175 | print_header("DEMO 1.1: RAG Query (Who is Dr. Evelyn Reed?)") 176 | query = "Who is Dr. Evelyn Reed and what companies is she associated with?" 
177 | result = graph_store_detailed.query_graph(query, output_mode="full") 178 | full_answer = generate_answer_from_context(query, result.get('graph'), result.get('chunks')) 179 | ASCIIColors.green("Final Answer (from Graph + Chunks):") 180 | print(full_answer) 181 | 182 | print_header("DEMO 1.2: Manually Editing the Graph") 183 | ASCIIColors.info("We will manually add a new product 'ChronoLeap' and link it to an 'Acme' company.") 184 | 185 | company_nodes = graph_store_detailed.get_nodes_by_label("Company") 186 | acme_node = next((n for n in company_nodes if 'acme' in n.get('properties', {}).get('name', '').lower()), None) 187 | 188 | if acme_node: 189 | acme_id = acme_node['node_id'] 190 | acme_name = acme_node['properties']['name'] 191 | ASCIIColors.info(f"Found '{acme_name}' with Node ID: {acme_id}") 192 | 193 | product_id = graph_store_detailed.add_node(label="Product", properties={"name": "ChronoLeap"}) 194 | ASCIIColors.info(f"Created new 'ChronoLeap' product with Node ID: {product_id}") 195 | 196 | rel_id = graph_store_detailed.add_relationship(acme_id, product_id, "PRODUCES") 197 | ASCIIColors.info(f"Linked them with 'PRODUCES' relationship (ID: {rel_id})") 198 | 199 | print_header("DEMO 1.3: Querying the Manually Added Data") 200 | manual_query = "What new products does Acme produce?" 201 | manual_result = graph_store_detailed.query_graph(manual_query, output_mode="full") 202 | manual_answer = generate_answer_from_context(manual_query, manual_result.get('graph')) 203 | ASCIIColors.green("Final Answer (from Graph-Only):") 204 | print(manual_answer) 205 | else: 206 | ASCIIColors.warning("Could not find any 'Acme' company node to perform manual edit demo.") 207 | 208 | print_header("PASS 2: Rebuilding Graph with SIMPLE Ontology") 209 | clear_graph_data(store.conn) 210 | 211 | graph_store_simple = GraphStore(store=store, llm_executor_callback=llm_executor_callback, ontology=SIMPLE_ONTOLOGY) 212 | graph_store_simple.build_graph_for_all_documents() 213 | ASCIIColors.success("Graph building with simple ontology complete.") 214 | 215 | print_header("DEMO 2.1: Observing the new simple graph structure") 216 | simple_nodes = graph_store_simple.get_nodes_by_label("Entity", limit=10) 217 | ASCIIColors.blue("\nNodes extracted with the simple 'Entity' label:") 218 | if simple_nodes: 219 | for n in simple_nodes: print(f" - ID: {n['node_id']}, Props: {n.get('properties')}") 220 | else: 221 | print(" No 'Entity' nodes found.") 222 | 223 | print_header("PASS 3: Rebuilding Graph with STRING-BASED Ontology") 224 | clear_graph_data(store.conn) 225 | 226 | graph_store_string = GraphStore(store=store, llm_executor_callback=llm_executor_callback, ontology=STRING_ONTOLOGY) 227 | graph_store_string.build_graph_for_all_documents() 228 | ASCIIColors.success("Graph building with string-based ontology complete.") 229 | 230 | print_header("DEMO 3.1: Observing the graph from string ontology") 231 | string_nodes_viz = graph_store_string.get_all_nodes_for_visualization(limit=15) 232 | ASCIIColors.blue("\nNodes extracted with the string ontology:") 233 | if string_nodes_viz: 234 | for n in string_nodes_viz: print(f" - Label: {n['label']}, Props: {n.get('properties')}") 235 | else: 236 | print(" No nodes found.") 237 | 238 | 239 | except Exception as e: 240 | ASCIIColors.error(f"An unexpected error occurred in the main process: {e}") 241 | trace_exception(e) 242 | finally: 243 | print_header("Example Finished") 244 | ASCIIColors.info(f"Database file is at: {Path(DB_FILE).resolve()}") 
--------------------------------------------------------------------------------
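A minimal sketch of driving the migration helper in scripts/migration_v1_v2.py from Python instead of the command line. It assumes the script sits at scripts/migration_v1_v2.py relative to the working directory and that ascii_colors is installed; since no scripts/__init__.py appears in the tree, the module is loaded by file path rather than imported as a package. The file name sketch_migrate.py and the database path my_database.sqlite are placeholders.

# sketch_migrate.py - hypothetical driver script, not part of the repository
import importlib.util
from pathlib import Path

# Load scripts/migration_v1_v2.py by file path (the scripts folder is not a package).
script_path = Path("scripts") / "migration_v1_v2.py"
spec = importlib.util.spec_from_file_location("migration_v1_v2", script_path)
migration = importlib.util.module_from_spec(spec)
spec.loader.exec_module(migration)

# migrate_v1_to_v2() returns True on success (or when the store is already at
# schema version 2.0) and False if the migration failed or was aborted.
ok = migration.migrate_v1_to_v2(Path("my_database.sqlite"), auto_yes=True)
print("migration ok" if ok else "migration failed or was aborted")

Passing auto_yes=True skips the interactive backup confirmation, mirroring the --yes flag exposed by the script's own argparse entry point.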