├── LICENSE ├── safe_store ├── config.py ├── core │ ├── __init__.py │ ├── models.py │ └── exceptions.py ├── utils │ ├── __init__.py │ ├── concurrency.py │ └── json_parsing.py ├── depricated.py ├── indexing │ └── __init__.py ├── search │ ├── __init__.py │ └── similarity.py ├── security │ ├── __init__.py │ └── encryption.py ├── vectorization │ ├── __init__.py │ ├── methods │ │ ├── __init__.py │ │ ├── sentense_transformer │ │ │ ├── description.yaml │ │ │ └── __init__.py │ │ ├── ollama │ │ │ ├── description.yaml │ │ │ └── __init__.py │ │ ├── lollms │ │ │ ├── description.yaml │ │ │ └── __init__.py │ │ ├── openai │ │ │ └── description.yaml │ │ ├── tf_idf │ │ │ ├── description.yaml │ │ │ └── __init__.py │ │ └── cohere │ │ │ ├── description.yaml │ │ │ └── __init__.py │ ├── base.py │ ├── utils.py │ └── manager.py ├── graph │ ├── __init__.py │ └── prompts │ │ ├── entity_fusion_prompt.md │ │ ├── query_parsing_prompt.md │ │ ├── graph_extraction_prompt.md │ │ └── graph_extraction_prompt_with_ontology.md ├── __init__.py └── processing │ ├── tokenizers.py │ └── text_cleaning.py ├── examples ├── SafeStoreGraph │ ├── .gitignore │ ├── icon.png │ ├── requirements.txt │ └── description.yaml ├── requirements.txt ├── basic_usage_text.py ├── encryption_usage.py ├── dynamic_model_selection.py ├── custom_logging.py ├── metadata_generation.py ├── basic_usage.py └── graph_usage.py ├── tests ├── security │ ├── __init__.py │ └── test_encryption.py ├── fixtures │ ├── sample.docx │ ├── sample.pdf │ └── sample.html ├── test_chunking.py ├── test_store_phase4.py ├── conftest.py └── test_store_phase3.py ├── docs ├── requirements.txt ├── api.rst ├── index.rst ├── conf.py ├── installation.rst ├── logging.rst ├── quickstart.rst └── encryption.rst ├── temp_docs_point_cloud ├── animals.txt ├── tech.txt └── space.txt ├── point_cloud_web_app ├── data.json └── index.html ├── pyproject.toml ├── .gitignore └── scripts └── migration_v1_v2.py /LICENSE: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /safe_store/config.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /safe_store/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /safe_store/core/models.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /safe_store/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /safe_store/depricated.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /safe_store/indexing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /safe_store/search/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /safe_store/security/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /safe_store/utils/concurrency.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /safe_store/vectorization/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /safe_store/vectorization/methods/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/SafeStoreGraph/.gitignore: -------------------------------------------------------------------------------- 1 | projects 2 | config.json -------------------------------------------------------------------------------- /tests/security/__init__.py: -------------------------------------------------------------------------------- 1 | # tests/security/__init__.py 2 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | Sphinx>=5.0 2 | sphinx-rtd-theme>=1.0 3 | # Add other Sphinx extensions if needed 4 | -------------------------------------------------------------------------------- /tests/fixtures/sample.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParisNeo/safe_store/HEAD/tests/fixtures/sample.docx -------------------------------------------------------------------------------- /tests/fixtures/sample.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParisNeo/safe_store/HEAD/tests/fixtures/sample.pdf -------------------------------------------------------------------------------- /examples/SafeStoreGraph/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParisNeo/safe_store/HEAD/examples/SafeStoreGraph/icon.png -------------------------------------------------------------------------------- /temp_docs_point_cloud/animals.txt: -------------------------------------------------------------------------------- 1 | The quick brown fox jumps over the lazy dog. A fast red fox is athletic. The sleepy dog rests. -------------------------------------------------------------------------------- /tests/fixtures/sample.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 |

This is HTML content.

5 | 6 | 7 | -------------------------------------------------------------------------------- /safe_store/graph/__init__.py: -------------------------------------------------------------------------------- 1 | # safe_store/graph/__init__.py 2 | from .graph_store import GraphStore 3 | 4 | __all__ = [ 5 | "GraphStore", 6 | ] -------------------------------------------------------------------------------- /temp_docs_point_cloud/tech.txt: -------------------------------------------------------------------------------- 1 | Python is a versatile programming language. Many developers use Python for AI. RAG pipelines are a common use case. -------------------------------------------------------------------------------- /examples/SafeStoreGraph/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi 2 | uvicorn[standard] 3 | python-multipart 4 | lollms_client>=1.3.0 5 | safe_store>=2.7.0 6 | pipmaster -------------------------------------------------------------------------------- /temp_docs_point_cloud/space.txt: -------------------------------------------------------------------------------- 1 | The sun is a star at the center of our solar system. The Earth revolves around the sun. Space exploration is fascinating. -------------------------------------------------------------------------------- /examples/requirements.txt: -------------------------------------------------------------------------------- 1 | safe-store 2 | extract-msg 3 | beautifulsoup4 4 | sentence-transformers 5 | scikit-learn 6 | pandas 7 | plotly 8 | lollms_client 9 | -------------------------------------------------------------------------------- /examples/SafeStoreGraph/description.yaml: -------------------------------------------------------------------------------- 1 | author: ParisNeo & Lollms 2 | category: Data 3 | creation_date: '2025-08-18T10:05:00.000000' 4 | description: This application allows the user to upload files and convert them to a Graph with an optional ontology. The user can then query the graph using natural language questions. The application uses SafeStore for graph storage and retrieval, and Gemini Pro as the LLM for processing and answering questions. 5 | disclaimer: null 6 | last_update_date: '2025-08-18T10:05:00.000000' 7 | model: Gemini Pro 8 | name: SafeStoreGraph 9 | version: 1.0 10 | -------------------------------------------------------------------------------- /safe_store/vectorization/methods/sentense_transformer/description.yaml: -------------------------------------------------------------------------------- 1 | title: Sentence Transformers Vectorizer 2 | author: ParisNeo 3 | creation_date: 2025-10-10 4 | last_update_date: 2025-10-10 5 | class_name: STVectorizer 6 | description: > 7 | A local vectorizer that uses models from the sentence-transformers library. 8 | Models are downloaded and run directly on your machine. 9 | input_parameters: 10 | - name: model 11 | type: str 12 | description: "The name of the Sentence Transformer model to use from the Hugging Face Hub." 
13 | mandatory: true 14 | default: "all-MiniLM-L6-v2" -------------------------------------------------------------------------------- /safe_store/vectorization/methods/ollama/description.yaml: -------------------------------------------------------------------------------- 1 | title: Ollama Vectorizer 2 | author: ParisNeo 3 | creation_date: 2025-10-10 4 | last_update_date: 2025-10-10 5 | class_name: OllamaVectorizer 6 | description: > 7 | A vectorizer that uses a local Ollama instance to generate text embeddings. 8 | Requires a running Ollama server. 9 | input_parameters: 10 | - name: model 11 | type: str 12 | description: "The name of the embedding model to use from your Ollama server (e.g., 'nomic-embed-text')." 13 | mandatory: true 14 | default: "" 15 | - name: host 16 | type: str 17 | description: "The URL of the Ollama server. If not provided, it defaults to http://localhost:11434 or the OLLAMA_HOST environment variable." 18 | mandatory: false 19 | default: "" -------------------------------------------------------------------------------- /safe_store/vectorization/methods/lollms/description.yaml: -------------------------------------------------------------------------------- 1 | title: Lollms Vectorizer 2 | author: ParisNeo 3 | creation_date: 2025-10-10 4 | last_update_date: 2025-10-10 5 | class_name: LollmsVectorizer 6 | description: > 7 | A vectorizer that connects to any OpenAI-compatible API, such as a local Lollms 8 | instance, for generating embeddings. 9 | input_parameters: 10 | - name: model 11 | type: str 12 | description: "The name of the embedding model served by the Lollms instance." 13 | mandatory: true 14 | default: "nomic-embed-text" 15 | - name: base_url 16 | type: str 17 | description: "The base URL of the OpenAI-compatible API endpoint." 18 | mandatory: true 19 | default: "http://localhost:9600" 20 | - name: api_key 21 | type: str 22 | description: "The API key for the service. Often not required for local instances." 23 | mandatory: false 24 | default: "not_needed" -------------------------------------------------------------------------------- /safe_store/vectorization/methods/openai/description.yaml: -------------------------------------------------------------------------------- 1 | title: OpenAI Vectorizer 2 | author: ParisNeo 3 | creation_date: 2025-10-10 4 | last_update_date: 2025-10-10 5 | class_name: OpenAIVectorizer 6 | description: > 7 | A vectorizer that uses OpenAI's API to generate text embeddings. 8 | Requires an OpenAI API key. 9 | input_parameters: 10 | - name: model 11 | type: str 12 | description: "The name of the OpenAI embedding model to use." 13 | mandatory: true 14 | default: "text-embedding-3-small" 15 | - name: api_key 16 | type: str 17 | description: "Your OpenAI API key. If not provided, the OPENAI_API_KEY environment variable will be used." 18 | mandatory: false 19 | default: "" 20 | - name: base_url 21 | type: str 22 | description: "Optional custom base URL for the OpenAI API, for use with proxies or other compatible services." 23 | mandatory: false 24 | default: "" -------------------------------------------------------------------------------- /safe_store/vectorization/methods/tf_idf/description.yaml: -------------------------------------------------------------------------------- 1 | title: TF-IDF Vectorizer 2 | author: ParisNeo 3 | creation_date: 2025-10-10 4 | last_update_date: 2025-10-10 5 | class_name: TfidfVectorizerWrapper 6 | description: > 7 | A classic, local vectorizer based on Term Frequency-Inverse Document Frequency. 
8 |   This vectorizer must be 'fit' on your data, so its performance is data-dependent.
9 |   It does not capture semantic meaning like deep learning models.
10 | input_parameters:
11 |   - name: name
12 |     type: str
13 |     description: "A unique name to identify this specific fitted TF-IDF model within the database."
14 |     mandatory: true
15 |     default: "default_tfidf"
16 |   - name: params
17 |     type: dict
18 |     description: "Optional dictionary of parameters to pass to the underlying scikit-learn TfidfVectorizer, such as 'ngram_range' or 'max_features'."
19 |     mandatory: false
20 |     default: {}
--------------------------------------------------------------------------------
/safe_store/graph/prompts/entity_fusion_prompt.md:
--------------------------------------------------------------------------------
1 | # [NEW & COMPLETE] prompts/entity_fusion_prompt.md
2 | Your task is to determine if two entities of the same type are, in fact, the same entity based on their properties.
3 | 
4 | **Entity Type:** {entity_label}
5 | 
6 | ---
7 | 
8 | **Entity A Properties:**
9 | ```json
10 | {node_a_properties}
11 | ```
12 | 
13 | ---
14 | 
15 | **Entity B Properties:**
16 | ```json
17 | {node_b_properties}
18 | ```
19 | 
20 | ---
21 | 
22 | **Analysis:**
23 | Carefully compare the properties of Entity A and Entity B. Do they refer to the same real-world entity? Consider variations in naming, partial information, or different levels of detail.
24 | 
25 | **Output Format:**
26 | You MUST respond with only a single, well-formed JSON object in a markdown code block. The JSON object must have two keys:
27 | 1. `"is_same"`: A boolean (`true` or `false`).
28 | 2. `"reasoning"`: A brief, one-sentence explanation for your decision.
29 | 
30 | **Example Response:**
31 | ```json
32 | {{
33 |   "is_same": true,
34 |   "reasoning": "Both entities share the same unique identifier and have highly similar descriptive properties."
35 | }}
36 | ```
--------------------------------------------------------------------------------
/docs/api.rst:
--------------------------------------------------------------------------------
1 | ===
2 | API
3 | ===
4 | 
5 | This section provides detailed documentation for the ``safe_store`` library's public API.
6 | 
7 | Core Class
8 | ----------
9 | 
10 | .. automodule:: safe_store.store
11 |     :members: SafeStore, LogLevel
12 |     :undoc-members:
13 |     :show-inheritance:
14 | 
15 | Exceptions
16 | ----------
17 | 
18 | .. automodule:: safe_store.core.exceptions
19 |     :members:
20 |     :undoc-members:
21 |     :show-inheritance:
22 | 
23 | Vectorizers
24 | -----------
25 | 
26 | .. automodule:: safe_store.vectorization.base
27 |     :members: BaseVectorizer
28 |     :undoc-members:
29 | 
30 | .. automodule:: safe_store.vectorization.methods.sentense_transformer
31 |     :members: STVectorizer
32 |     :undoc-members:
33 | 
34 | .. automodule:: safe_store.vectorization.methods.tf_idf
35 |     :members: TfIdfVectorizer
36 |     :undoc-members:
37 | 
38 | Utilities
39 | ---------
40 | While primarily used internally, the ``ascii_colors`` library is exposed for configuration.
41 | 
42 | .. 
automodule:: ascii_colors 43 | :members: ASCIIColors, LogLevel, FileHandler, Formatter, JSONFormatter 44 | :undoc-members: 45 | 46 | (Add other modules/classes as needed) 47 | -------------------------------------------------------------------------------- /safe_store/vectorization/methods/cohere/description.yaml: -------------------------------------------------------------------------------- 1 | title: Cohere Vectorizer 2 | author: ParisNeo 3 | creation_date: 2025-10-10 4 | last_update_date: 2025-10-10 5 | class_name: CohereVectorizer 6 | description: > 7 | A vectorizer that uses Cohere's API to generate text embeddings. 8 | Requires a Cohere API key, which can be provided via the 'api_key' parameter 9 | or the COHERE_API_KEY environment variable. 10 | input_parameters: 11 | - name: model 12 | type: str 13 | description: "The name of the Cohere embedding model to use." 14 | mandatory: true 15 | default: "embed-english-v3.0" 16 | - name: api_key 17 | type: str 18 | description: "Your Cohere API key. If not provided, the COHERE_API_KEY environment variable will be used." 19 | mandatory: false 20 | default: "" 21 | - name: input_type 22 | type: str 23 | description: "The type of input being embedded, e.g., 'search_document' or 'search_query'." 24 | mandatory: false 25 | default: "search_document" 26 | - name: truncate 27 | type: str 28 | description: "The truncation strategy for inputs longer than the model's context window ('NONE', 'START', 'END')." 29 | mandatory: false 30 | default: "END" -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. safe_store documentation master file, created by 2 | sphinx-quickstart on . 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to safe_store's documentation! 7 | ===================================== 8 | 9 | **safe_store** is a Python library providing a lightweight, file-based vector database using SQLite. It's designed for simplicity and efficiency, making it ideal for integrating into local Retrieval-Augmented Generation (RAG) pipelines. 10 | 11 | Key Features: 12 | 13 | * **Local SQLite Backend:** Simple, single-file database. 14 | * **Concurrency Safe:** Handles multiple processes writing via file locks. 15 | * **Multiple Vectorizers:** Supports Sentence Transformers, TF-IDF, etc. 16 | * **Document Parsing:** Handles `.txt`, `.pdf`, `.docx`, `.html`. 17 | * **Optional Encryption:** Securely store chunk text at rest. 18 | * **Informative Logging:** Clear console output via `ascii_colors`. 19 | 20 | .. 
toctree:: 21 | :maxdepth: 2 22 | :caption: Contents: 23 | 24 | installation 25 | quickstart 26 | logging 27 | encryption 28 | api 29 | 30 | 31 | Indices and tables 32 | ================== 33 | 34 | * :ref:`genindex` 35 | * :ref:`modindex` 36 | * :ref:`search` 37 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # docs/conf.py 2 | import os 3 | import sys 4 | sys.path.insert(0, os.path.abspath('..')) # Add project root to path 5 | import safe_store # Import your package 6 | 7 | project = 'safe_store' 8 | copyright = '2025, ParisNeo' # Update year/author 9 | author = 'ParisNeo' 10 | 11 | # Get version from package 12 | release = safe_store.__version__ 13 | 14 | extensions = [ 15 | 'sphinx.ext.autodoc', 16 | 'sphinx.ext.napoleon', # For Google/NumPy style docstrings 17 | 'sphinx.ext.intersphinx', 18 | 'sphinx.ext.viewcode', 19 | 'sphinx_rtd_theme', 20 | ] 21 | 22 | templates_path = ['_templates'] 23 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 24 | 25 | html_theme = 'sphinx_rtd_theme' 26 | html_static_path = ['_static'] 27 | 28 | # Autodoc settings 29 | autodoc_member_order = 'bysource' 30 | autodoc_default_options = { 31 | 'members': True, 32 | 'undoc-members': True, 33 | 'private-members': False, 34 | 'special-members': '__init__', # Include __init__ methods 35 | 'show-inheritance': True, 36 | } 37 | 38 | # Intersphinx settings 39 | intersphinx_mapping = { 40 | 'python': ('https://docs.python.org/3', None), 41 | 'numpy': ('https://numpy.org/doc/stable/', None), 42 | 'sklearn': ('https://scikit-learn.org/stable/', None), 43 | # Add others if needed (e.g., cryptography, sentence-transformers) 44 | } 45 | -------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Installation 3 | ============ 4 | 5 | Install ``safe_store`` using pip: 6 | 7 | .. code-block:: bash 8 | 9 | pip install safe_store 10 | 11 | Optional Dependencies 12 | --------------------- 13 | 14 | ``safe_store`` uses optional dependencies for certain features like specific vectorizers or document parsers. You can install these extras as needed: 15 | 16 | * **Sentence Transformers:** For state-of-the-art sentence embeddings. 17 | .. code-block:: bash 18 | 19 | pip install safe_store[sentence-transformers] 20 | 21 | * **TF-IDF:** For classic TF-IDF vectorization (requires scikit-learn). 22 | .. code-block:: bash 23 | 24 | pip install safe_store[tfidf] 25 | 26 | * **Document Parsing:** For handling ``.pdf``, ``.docx``, and ``.html`` files. 27 | .. code-block:: bash 28 | 29 | pip install safe_store[parsing] 30 | 31 | * **Encryption:** For encrypting chunk text at rest (requires cryptography). 32 | .. code-block:: bash 33 | 34 | pip install safe_store[encryption] 35 | 36 | * **All Features:** To install all optional dependencies at once. 37 | .. code-block:: bash 38 | 39 | pip install safe_store[all] 40 | 41 | * **Development:** To install dependencies needed for testing, building, and documentation generation. 42 | .. 
code-block:: bash 43 | 44 | pip install safe_store[dev] 45 | -------------------------------------------------------------------------------- /safe_store/vectorization/base.py: -------------------------------------------------------------------------------- 1 | # safe_store/vectorization/base.py 2 | from abc import ABC, abstractmethod 3 | import numpy as np 4 | from typing import List, Optional, Any 5 | 6 | class BaseVectorizer(ABC): 7 | """ 8 | Abstract base class for all vectorizer implementations within safe_store. 9 | """ 10 | 11 | def __init__(self, vectorizer_name:str="unknown"): 12 | self.vectorizer_name = vectorizer_name 13 | 14 | @abstractmethod 15 | def vectorize(self, texts: List[str]) -> np.ndarray: 16 | """Converts a list of text documents into a NumPy array of vector embeddings.""" 17 | pass 18 | 19 | @property 20 | @abstractmethod 21 | def dim(self) -> Optional[int]: 22 | """The dimension of the vectors produced by this vectorizer.""" 23 | pass 24 | 25 | @property 26 | @abstractmethod 27 | def dtype(self) -> np.dtype: 28 | """The NumPy data type of the vector embeddings.""" 29 | pass 30 | 31 | def get_tokenizer(self) -> Optional[Any]: 32 | """ 33 | Returns the tokenizer associated with the vectorizer, if available. 34 | 35 | The returned tokenizer should have `encode` and `decode` methods 36 | compatible with libraries like Hugging Face's tokenizers. 37 | 38 | Returns: 39 | A tokenizer object or None if no tokenizer is available client-side. 40 | """ 41 | return None 42 | 43 | @staticmethod 44 | def list_models(**kwargs) -> List[str]: 45 | """ 46 | Lists the available models for this vectorizer. 47 | This method should be overridden by subclasses that support model listing. 48 | """ 49 | return [] -------------------------------------------------------------------------------- /safe_store/vectorization/utils.py: -------------------------------------------------------------------------------- 1 | # safe_store/vectorization/utils.py 2 | import importlib.util 3 | from pathlib import Path 4 | from typing import Any 5 | from ..core.exceptions import ConfigurationError 6 | 7 | def load_vectorizer_module(vectorizer_name: str, custom_vectorizers_path: str = None) -> Any: 8 | """Dynamically loads a vectorizer module from built-in methods or a custom path.""" 9 | 10 | # First, try loading from the custom path if provided 11 | if custom_vectorizers_path: 12 | custom_path = Path(custom_vectorizers_path) / vectorizer_name / "__init__.py" 13 | if custom_path.exists(): 14 | try: 15 | spec = importlib.util.spec_from_file_location(f"custom_vectorizers.{vectorizer_name}", custom_path) 16 | if spec and spec.loader: 17 | module = importlib.util.module_from_spec(spec) 18 | spec.loader.exec_module(module) 19 | return module 20 | except Exception as e: 21 | raise ConfigurationError(f"Failed to load custom vectorizer '{vectorizer_name}' from {custom_path}: {e}") from e 22 | 23 | # If not in custom, try built-in methods 24 | builtin_path = Path(__file__).parent / "methods" / vectorizer_name / "__init__.py" 25 | if builtin_path.exists(): 26 | try: 27 | module_name = f"safe_store.vectorization.methods.{vectorizer_name}" 28 | return importlib.import_module(module_name) 29 | except Exception as e: 30 | raise ConfigurationError(f"Failed to load built-in vectorizer '{vectorizer_name}': {e}") from e 31 | 32 | raise FileNotFoundError(f"Vectorizer module '{vectorizer_name}' not found in built-in methods or custom path.") -------------------------------------------------------------------------------- 
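The two files above define the contract a vectorizer must satisfy (`BaseVectorizer`) and how `load_vectorizer_module` resolves a name to either `<custom_vectorizers_path>/<name>/__init__.py` or a built-in package under `methods/`. The following is a minimal sketch of what such a drop-in custom vectorizer module could look like. The folder name `hashing_vectorizer`, the `dim` option, and the toy hashing logic are invented for illustration; only the `BaseVectorizer` interface and the module-level `class_name` convention (mirroring the built-in `tf_idf` method shown later in this listing) come from the sources here, and since `manager.py` is not reproduced in this section, the exact instantiation wiring should be treated as an assumption.

```python
# Hypothetical layout: <custom_vectorizers_path>/hashing_vectorizer/__init__.py
import zlib
from typing import Any, Dict, List, Optional

import numpy as np

from safe_store.vectorization.base import BaseVectorizer

# Module-level class name, mirroring the convention used by the built-in tf_idf method.
class_name = "HashingVectorizer"


class HashingVectorizer(BaseVectorizer):
    """Toy feature-hashing vectorizer: fixed dimension, no fitting step."""

    def __init__(self, model_config: Dict[str, Any], cache_folder: Optional[str] = None):
        super().__init__("hashing")
        # 'dim' is an illustrative parameter, not part of safe_store itself.
        self._dim = int(model_config.get("dim", 256))

    def vectorize(self, texts: List[str]) -> np.ndarray:
        vectors = np.zeros((len(texts), self._dim), dtype=np.float32)
        for row, text in enumerate(texts):
            for token in text.lower().split():
                # Stable hash so the same token always lands in the same bucket.
                vectors[row, zlib.crc32(token.encode("utf-8")) % self._dim] += 1.0
        # L2-normalize so cosine similarity behaves sensibly on these counts.
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        return vectors / np.maximum(norms, 1e-12)

    @property
    def dim(self) -> Optional[int]:
        return self._dim

    @property
    def dtype(self) -> np.dtype:
        return np.float32
```

With a module like this in place, a name such as `"hashing_vectorizer"` together with a custom vectorizers path pointing at its parent folder is what `load_vectorizer_module` expects to find; the corresponding SafeStore constructor arguments are not visible in this section, so that part is an assumption as well.
--------------------------------------------------------------------------------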
/safe_store/graph/prompts/query_parsing_prompt.md: -------------------------------------------------------------------------------- 1 | Parse the following query to identify main entities ("seed_nodes"). 2 | Format the output STRICTLY as a JSON object. 3 | **The entire JSON output MUST be enclosed in a single markdown code block starting with ```json and ending with ```.** 4 | 5 | JSON structure: 6 | ```json 7 | {{ 8 | "seed_nodes": [ 9 | {{"label": "EntityType", "identifying_property_key": "property_name", "identifying_property_value": "property_value"}} 10 | ], 11 | "target_relationships": [ {{"type": "REL_TYPE", "direction": "outgoing|incoming|any"}} ], 12 | "target_node_labels": ["Label1", "Label2"], 13 | "max_depth": 1 14 | }}``` 15 | - "seed_nodes": List of main entities from the query. 16 | - "label": The type of the entity. 17 | - "identifying_property_key": The name of the property that identifies the entity (e.g., "name", "title"). 18 | - "identifying_property_value": The value of that identifying property. 19 | - "target_relationships" (Optional): Desired relationship types and directions. 20 | - "target_node_labels" (Optional): Desired types of neighbor nodes. 21 | - "max_depth" (Optional, default 1): Traversal depth. 22 | 23 | Example Query: "Who is Evelyn Reed and what companies is she associated with?" 24 | Example JSON (wrapped in ```json ... ```): 25 | ```json 26 | {{ 27 | "seed_nodes": [ {{"label": "Person", "identifying_property_key": "name", "identifying_property_value": "Evelyn Reed"}} ], 28 | "target_relationships": [ {{"type": "WORKS_AT", "direction": "any"}}, {{"type": "CEO_OF", "direction": "any"}} ], 29 | "target_node_labels": ["Company", "Organization"], 30 | "max_depth": 1 31 | }} 32 | ``` 33 | 34 | If no clear entities, return `{{ "seed_nodes": [] }}`. 35 | 36 | Query: --- {natural_language_query} --- Parsed JSON Query (wrapped in ```json ... 
```): -------------------------------------------------------------------------------- /safe_store/vectorization/methods/tf_idf/__init__.py: -------------------------------------------------------------------------------- 1 | # safe_store/vectorization/methods/tf_idf/__init__.py 2 | import numpy as np 3 | from typing import List, Optional, Dict, Any 4 | import pickle 5 | from safe_store.vectorization.base import BaseVectorizer 6 | from safe_store.core.exceptions import ConfigurationError, VectorizationError 7 | import pipmaster as pm 8 | 9 | class_name = "TfIdfVectorizer" 10 | 11 | class TfIdfVectorizer(BaseVectorizer): 12 | def __init__(self, model_config: Dict[str, Any], cache_folder: Optional[str] = None): 13 | super().__init__("tfidf") 14 | pm.ensure_packages(["scikit-learn"]) 15 | from sklearn.feature_extraction.text import TfidfVectorizer as SklearnTfidfVectorizer 16 | 17 | self.vectorizer = SklearnTfidfVectorizer() 18 | self._fitted = False 19 | self._dim = None 20 | 21 | def fit(self, texts: List[str]): 22 | self.vectorizer.fit(texts) 23 | self._fitted = True 24 | self._dim = len(self.vectorizer.get_feature_names_out()) 25 | 26 | def vectorize(self, texts: List[str]) -> np.ndarray: 27 | if not self._fitted: 28 | raise VectorizationError("TF-IDF vectorizer must be fitted before vectorizing.") 29 | return self.vectorizer.transform(texts).toarray().astype(np.float32) 30 | 31 | @property 32 | def dim(self) -> Optional[int]: 33 | return self._dim 34 | 35 | @property 36 | def dtype(self) -> np.dtype: 37 | return np.float32 38 | 39 | def get_params_to_store(self) -> Dict[str, Any]: 40 | return {"vectorizer_pickle": pickle.dumps(self.vectorizer)} 41 | 42 | @staticmethod 43 | def list_models(**kwargs) -> List[str]: 44 | """TF-IDF is a data-dependent model, not a pre-trained one. It has one 'model' type.""" 45 | return ["tfidf"] -------------------------------------------------------------------------------- /tests/test_chunking.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from safe_store.indexing.chunking import chunk_text 3 | 4 | def test_chunk_simple(): 5 | text = "abcdefghijklmnopqrstuvwxyz" 6 | chunks = chunk_text(text, chunk_size=10, chunk_overlap=3) 7 | # Expected: 8 | # abcdefghij (0, 10) 9 | # hijklmnopq (7, 17) 10 | # opqrstuvwx (14, 24) 11 | # uvwxyz (21, 26) 12 | assert len(chunks) == 4 13 | assert chunks[0] == ("abcdefghij", 0, 10) 14 | assert chunks[1] == ("hijklmnopq", 7, 17) 15 | assert chunks[2] == ("opqrstuvwx", 14, 24) 16 | assert chunks[3] == ('vwxyz', 21, 26) 17 | 18 | def test_chunk_no_overlap(): 19 | text = "abcde fghij klmno" 20 | chunks = chunk_text(text, chunk_size=5, chunk_overlap=0) 21 | assert len(chunks) == 4 22 | assert chunks[0] == ("abcde", 0, 5) 23 | assert chunks[1] == (" fghi", 5, 10) # Note space included 24 | assert chunks[2] == ("j klm", 10, 15) 25 | assert chunks[3] == ("no", 15, 17) 26 | 27 | 28 | def test_chunk_large_overlap_error(): 29 | with pytest.raises(ValueError): 30 | chunk_text("abc", chunk_size=5, chunk_overlap=5) 31 | 32 | def test_chunk_smaller_than_size(): 33 | text = "short" 34 | chunks = chunk_text(text, chunk_size=10, chunk_overlap=2) 35 | assert len(chunks) == 1 36 | assert chunks[0] == ("short", 0, 5) 37 | 38 | def test_chunk_exact_size(): 39 | text = "exactsize!" 
# 10 chars 40 | chunks = chunk_text(text, chunk_size=10, chunk_overlap=2) 41 | assert len(chunks) == 1 42 | assert chunks[0] == ("exactsize!", 0, 10) 43 | 44 | def test_chunk_edge_case_overlap(): 45 | # Test where overlap calculation might stall if not handled 46 | text = "1234567890" 47 | chunks = chunk_text(text, chunk_size=5, chunk_overlap=4) 48 | # 12345 (0, 5) 49 | # 23456 (1, 6) 50 | # 34567 (2, 7) 51 | # ... 52 | # 67890 (5, 10) 53 | assert len(chunks) == 6 54 | assert chunks[0] == ("12345", 0, 5) 55 | assert chunks[-1] == ("67890", 5, 10) -------------------------------------------------------------------------------- /safe_store/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | safe_store: Simple SQLite Vector Store for RAG. 3 | 4 | A Python utility library providing a lightweight, efficient, and file-based 5 | vector database using SQLite. Optimized for easy integration into 6 | Retrieval-Augmented Generation (RAG) pipelines for Large Language Models (LLMs). 7 | Includes optional encryption, concurrency control, and graph data capabilities. 8 | """ 9 | 10 | from .store import SafeStore, LogLevel, TEMP_FILE_DB_INDICATOR, IN_MEMORY_DB_INDICATOR, DEFAULT_LOCK_TIMEOUT 11 | from .graph.graph_store import GraphStore 12 | from .core.exceptions import ( # Expose exceptions for users 13 | SafeStoreError, 14 | DatabaseError, 15 | FileHandlingError, 16 | ParsingError, 17 | IndexingError, 18 | VectorizationError, 19 | QueryError, 20 | ConfigurationError, 21 | ConcurrencyError, 22 | EncryptionError, 23 | # Graph specific exceptions 24 | GraphError, 25 | GraphDBError, 26 | GraphProcessingError, 27 | LLMCallbackError, 28 | ) 29 | from .indexing.parser import SAFE_STORE_SUPPORTED_FILE_EXTENSIONS, parse_document 30 | from .processing.text_cleaning import basic_text_cleaner # Expose the basic cleaner as a utility 31 | from ascii_colors import ASCIIColors # Expose for user configuration convenience 32 | 33 | __version__ = "3.3.2" # Version bump to reflect API changes 34 | 35 | __all__ = [ 36 | "SafeStore", 37 | "GraphStore", 38 | "ASCIIColors", 39 | "LogLevel", 40 | # Exceptions 41 | "SafeStoreError", 42 | "DatabaseError", 43 | "FileHandlingError", 44 | "ParsingError", 45 | "IndexingError", 46 | "VectorizationError", 47 | "QueryError", 48 | "ConfigurationError", 49 | "ConcurrencyError", 50 | "EncryptionError", 51 | "GraphError", 52 | "GraphDBError", 53 | "GraphProcessingError", 54 | "LLMCallbackError", 55 | # globals 56 | "SAFE_STORE_SUPPORTED_FILE_EXTENSIONS", 57 | "TEMP_FILE_DB_INDICATOR", 58 | "IN_MEMORY_DB_INDICATOR", 59 | "DEFAULT_LOCK_TIMEOUT", 60 | # utilities 61 | "parse_document", 62 | "basic_text_cleaner" 63 | ] -------------------------------------------------------------------------------- /safe_store/graph/prompts/graph_extraction_prompt.md: -------------------------------------------------------------------------------- 1 | **CRITICAL INSTRUCTION: You are a data extraction expert. Your task is to extract entities (nodes) and relationships from the provided text, strictly adhering to the ontology schema below.** 2 | 3 | - **ONLY** extract nodes whose `label` is explicitly defined in the "NODE LABELS" section of the ontology. 4 | - For each extracted node, **ONLY** include properties that are listed for that specific label in the ontology. Be exhaustive and extract every property defined in the ontology that is present in the text. 
5 | - **ONLY** create relationships where the `type` is explicitly defined in the "RELATIONSHIP TYPES" section. 6 | - You **MUST** respect the `Source` and `Target` constraints for relationships if they are specified. 7 | - If an entity or relationship in the text does not fit the ontology, **DO NOT** extract it. 8 | - Every node's `properties` object **MUST** contain an `identifying_value`. This is a unique name or identifier for the entity (e.g., "John Doe", "Acme Corporation") and is used to link relationships. 9 | - Format the output as a single JSON object inside a markdown code block. 10 | 11 | **User Guidance (Follow these additional instructions within the ontology's constraints):** 12 | {user_guidance} 13 | --- 14 | 15 | **Text to process:** 16 | {chunk_text} 17 | --- 18 | 19 | **JSON Output Structure (Populate this structure according to the rules):** 20 | ```json 21 | {{ 22 | "nodes": [ 23 | {{ 24 | "label": "LabelFromOntology", 25 | "properties": {{ 26 | "identifying_value": "A unique value for this entity (MANDATORY)", 27 | "property_from_ontology": "Value from text", 28 | "...": "..." 29 | }} 30 | }} 31 | ], 32 | "relationships": [ 33 | {{ 34 | "source_node_label": "SourceLabelFromOntology", 35 | "source_node_identifying_value": "Identifier of the source node", 36 | "target_node_label": "TargetLabelFromOntology", 37 | "target_node_identifying_value": "Identifier of the target node", 38 | "type": "RelationshipTypeFromOntology", 39 | "properties": {{ 40 | "role": "A role or description if applicable" 41 | }} 42 | }} 43 | ] 44 | }} 45 | ``` -------------------------------------------------------------------------------- /point_cloud_web_app/data.json: -------------------------------------------------------------------------------- 1 | [{"x": -0.38227444887161255, "y": -0.5533952116966248, "chunk_id": 1, "document_title": "animals.txt", "document_path": "C:\\Users\\aloui\\Documents\\ai\\safe_store\\temp_docs_point_cloud\\animals.txt", "metadata": {"topic": "animals", "source": "fiction"}}, {"x": -0.38085994124412537, "y": -0.5823900103569031, "chunk_id": 2, "document_title": "animals.txt", "document_path": "C:\\Users\\aloui\\Documents\\ai\\safe_store\\temp_docs_point_cloud\\animals.txt", "metadata": {"topic": "animals", "source": "fiction"}}, {"x": -0.3778142035007477, "y": -0.40828418731689453, "chunk_id": 3, "document_title": "animals.txt", "document_path": "C:\\Users\\aloui\\Documents\\ai\\safe_store\\temp_docs_point_cloud\\animals.txt", "metadata": {"topic": "animals", "source": "fiction"}}, {"x": 0.5525932908058167, "y": -0.06589919328689575, "chunk_id": 4, "document_title": "tech.txt", "document_path": "C:\\Users\\aloui\\Documents\\ai\\safe_store\\temp_docs_point_cloud\\tech.txt", "metadata": {"topic": "technology", "source": "documentation"}}, {"x": 0.7413166165351868, "y": -0.11253535002470016, "chunk_id": 5, "document_title": "tech.txt", "document_path": "C:\\Users\\aloui\\Documents\\ai\\safe_store\\temp_docs_point_cloud\\tech.txt", "metadata": {"topic": "technology", "source": "documentation"}}, {"x": 0.6865788698196411, "y": 0.0834011361002922, "chunk_id": 6, "document_title": "tech.txt", "document_path": "C:\\Users\\aloui\\Documents\\ai\\safe_store\\temp_docs_point_cloud\\tech.txt", "metadata": {"topic": "technology", "source": "documentation"}}, {"x": -0.47032028436660767, "y": 0.5269608497619629, "chunk_id": 7, "document_title": "space.txt", "document_path": "C:\\Users\\aloui\\Documents\\ai\\safe_store\\temp_docs_point_cloud\\space.txt", "metadata": {"topic": 
"space", "source": "science"}}, {"x": -0.38039878010749817, "y": 0.6285385489463806, "chunk_id": 8, "document_title": "space.txt", "document_path": "C:\\Users\\aloui\\Documents\\ai\\safe_store\\temp_docs_point_cloud\\space.txt", "metadata": {"topic": "space", "source": "science"}}, {"x": -0.09704269468784332, "y": 0.44058260321617126, "chunk_id": 9, "document_title": "space.txt", "document_path": "C:\\Users\\aloui\\Documents\\ai\\safe_store\\temp_docs_point_cloud\\space.txt", "metadata": {"topic": "space", "source": "science"}}, {"x": 0.10822149366140366, "y": 0.04302079603075981, "chunk_id": 10, "document_title": "space.txt", "document_path": "C:\\Users\\aloui\\Documents\\ai\\safe_store\\temp_docs_point_cloud\\space.txt", "metadata": {"topic": "space", "source": "science"}}] -------------------------------------------------------------------------------- /safe_store/processing/tokenizers.py: -------------------------------------------------------------------------------- 1 | # safe_store/processing/tokenizers.py 2 | from typing import Dict, Any, List 3 | from abc import ABC, abstractmethod 4 | from safe_store.core.exceptions import ConfigurationError 5 | 6 | class TokenizerWrapper(ABC): 7 | """An abstract base class for a standardized tokenizer interface.""" 8 | @abstractmethod 9 | def encode(self, text: str) -> List[int]: 10 | pass 11 | 12 | @abstractmethod 13 | def decode(self, tokens: List[int]) -> str: 14 | pass 15 | 16 | class TikTokenWrapper(TokenizerWrapper): 17 | """A wrapper for tiktoken's Encoding object.""" 18 | def __init__(self, tokenizer: Any): 19 | self.tokenizer = tokenizer 20 | 21 | def encode(self, text: str) -> List[int]: 22 | return self.tokenizer.encode(text) 23 | 24 | def decode(self, tokens: List[int]) -> str: 25 | # tiktoken's decode does not take extra arguments 26 | return self.tokenizer.decode(tokens) 27 | 28 | class HuggingFaceTokenizerWrapper(TokenizerWrapper): 29 | """A wrapper for Hugging Face's tokenizer objects.""" 30 | def __init__(self, tokenizer: Any): 31 | self.tokenizer = tokenizer 32 | 33 | def encode(self, text: str) -> List[int]: 34 | return self.tokenizer.encode(text) 35 | 36 | def decode(self, tokens: List[int]) -> str: 37 | # Hugging Face tokenizers use skip_special_tokens 38 | return self.tokenizer.decode(tokens, skip_special_tokens=True) 39 | 40 | 41 | def get_tokenizer(config: Dict[str, Any]) -> TokenizerWrapper: 42 | """ 43 | Loads and returns a wrapped tokenizer based on the provided configuration. 44 | """ 45 | if not isinstance(config, dict) or "name" not in config: 46 | raise ValueError("Custom tokenizer configuration must be a dictionary with a 'name' key.") 47 | 48 | tokenizer_name = config["name"] 49 | 50 | if tokenizer_name == "tiktoken": 51 | try: 52 | import tiktoken 53 | except ImportError: 54 | raise ConfigurationError("The 'tiktoken' library is required. 
Please run: pip install tiktoken") 55 | 56 | model = config.get("model") 57 | if not model: 58 | raise ValueError("The 'tiktoken' tokenizer requires a 'model' key (e.g., 'cl100k_base').") 59 | 60 | try: 61 | tokenizer_instance = tiktoken.get_encoding(model) 62 | return TikTokenWrapper(tokenizer_instance) 63 | except Exception as e: 64 | raise ConfigurationError(f"Failed to load tiktoken encoding '{model}': {e}") from e 65 | 66 | else: 67 | raise ValueError(f"Unknown custom tokenizer name: '{tokenizer_name}'") -------------------------------------------------------------------------------- /safe_store/graph/prompts/graph_extraction_prompt_with_ontology.md: -------------------------------------------------------------------------------- 1 | **CRITICAL INSTRUCTION: You are a data extraction expert. Your task is to extract entities (nodes) and relationships from the provided text, strictly adhering to the ontology schema below.** 2 | 3 | - **ONLY** extract nodes whose `label` is explicitly defined in the "NODE LABELS" section of the ontology. **If an entity in the text does not match a label in the ontology, you MUST ignore it and not include it in the output.** 4 | - For each extracted node, **ONLY** include properties that are listed for that specific label in the ontology. Be exhaustive and extract every property defined in the ontology that is present in the text. 5 | - **ONLY** create relationships where the `type` is explicitly defined in the "RELATIONSHIP TYPES" section. 6 | - You **MUST** respect the `Source` and `Target` constraints for relationships if they are specified. 7 | - If an entity or relationship in the text does not fit the ontology, **DO NOT** extract it. 8 | - Every node's `properties` object **MUST** contain an `identifying_value`. This is a unique name or identifier for the entity (e.g., "John Doe", "Acme Corporation") and is used to link relationships. 9 | - Format the output as a single JSON object inside a markdown code block. 10 | 11 | **User Guidance (Follow these additional instructions within the ontology's constraints):** 12 | {user_guidance} 13 | --- 14 | 15 | **Text to process:** 16 | {chunk_text} 17 | --- 18 | 19 | **JSON Output Structure (Populate this structure according to the rules):** 20 | ```json 21 | {{ 22 | "nodes": [ 23 | {{ 24 | "label": "LabelFromOntology", 25 | "properties": {{ 26 | "identifying_value": "A unique value for this entity (MANDATORY)", 27 | "property_from_ontology": "Value from text", 28 | "...": "..." 29 | }} 30 | }} 31 | ], 32 | "relationships": [ 33 | {{ 34 | "source_node_label": "SourceLabelFromOntology", 35 | "source_node_identifying_value": "Identifier of the source node", 36 | "target_node_label": "TargetLabelFromOntology", 37 | "target_node_identifying_value": "Identifier of the target node", 38 | "type": "RelationshipTypeFromOntology", 39 | "properties": {{ 40 | "role": "A role or description if applicable" 41 | }} 42 | }} 43 | ] 44 | }} 45 | ``` 46 | **WARNING:** 47 | - For nodes, `label` and `properties.identifying_value` are mandatory. 48 | - For relationships, `source_node_label`, `source_node_identifying_value`, `target_node_label`, `target_node_identifying_value`, and `type` are mandatory. 
-------------------------------------------------------------------------------- /safe_store/core/exceptions.py: -------------------------------------------------------------------------------- 1 | # safe_store/core/exceptions.py 2 | 3 | class SafeStoreError(Exception): 4 | """Base class for all safe_store specific errors.""" 5 | pass 6 | 7 | class DatabaseError(SafeStoreError): 8 | """Errors related to database operations (connection, schema, query, transaction).""" 9 | pass 10 | 11 | class FileHandlingError(SafeStoreError): 12 | """Errors related to file system operations (reading, writing, hashing, not found).""" 13 | pass 14 | 15 | class ParsingError(FileHandlingError): 16 | """Errors occurring during document parsing (subclass of FileHandlingError).""" 17 | pass 18 | 19 | class ConfigurationError(SafeStoreError): 20 | """Errors related to invalid configuration, setup, or missing optional dependencies.""" 21 | pass 22 | 23 | class IndexingError(SafeStoreError): 24 | """Errors specifically within the document indexing pipeline (chunking, storage logic).""" 25 | # Note: ParsingError, VectorizationError cover sub-steps. This is for orchestration. 26 | pass 27 | 28 | class VectorizationError(SafeStoreError): 29 | """Errors related to vectorization processes (model loading, encoding, fitting).""" 30 | pass 31 | 32 | class QueryError(SafeStoreError): 33 | """Errors occurring during query execution (similarity calculation, result fetching).""" 34 | pass 35 | 36 | class ConcurrencyError(SafeStoreError): 37 | """Errors related to file locking or concurrent access issues (e.g., timeouts).""" 38 | pass 39 | 40 | class EncryptionError(SafeStoreError): 41 | """Errors related to data encryption or decryption.""" 42 | pass 43 | 44 | # --- New Graph-related Exceptions --- 45 | class GraphError(SafeStoreError): 46 | """Base class for graph-specific errors.""" 47 | pass 48 | 49 | class GraphDBError(DatabaseError, GraphError): # Inherits from DatabaseError and GraphError 50 | """Errors related to graph database operations.""" 51 | pass 52 | 53 | class GraphProcessingError(GraphError): 54 | """Errors occurring during the processing of text to extract graph elements.""" 55 | pass 56 | 57 | class LLMCallbackError(GraphProcessingError): 58 | """Errors related to the LLM processing callback function.""" 59 | pass 60 | 61 | class NodeNotFoundError(GraphError): 62 | """Errors occurring during the processing of text to extract graph elements.""" 63 | pass 64 | class RelationshipNotFoundError(GraphError): 65 | """Errors occurring during the processing of text to extract graph elements.""" 66 | pass 67 | class DocumentNotFoundError(GraphError): 68 | """Errors occurring during the processing of text or file.""" 69 | pass 70 | 71 | class GraphEntityFusionError(GraphProcessingError): 72 | """Errors related to the entity fusion process, including LLM decisions.""" 73 | pass -------------------------------------------------------------------------------- /examples/basic_usage_text.py: -------------------------------------------------------------------------------- 1 | from safe_store import SafeStore 2 | from pathlib import Path 3 | 4 | # --- Cleanup --- 5 | # Ensure the database from previous runs is removed for a clean start 6 | db_file = Path("basic_usage_store.db") 7 | db_file.unlink(missing_ok=True) 8 | Path(f"{db_file}.lock").unlink(missing_ok=True) 9 | 10 | 11 | # --- 1. 
Initialize the store with a fixed configuration --- 12 | # All indexing parameters (vectorizer, chunking, cleaning) are now defined 13 | # when the SafeStore instance is created. 14 | print("--- Initializing SafeStore with a fixed configuration ---") 15 | ss = SafeStore( 16 | db_path=db_file, 17 | name="my_database", 18 | description="A cool database demonstrating fixed configuration", 19 | 20 | # Vectorizer Configuration 21 | vectorizer_name="st", 22 | vectorizer_config={"model": "all-MiniLM-L6-v2"}, 23 | 24 | # Chunking and Processing Configuration 25 | chunk_size=10, # Small chunk size for demonstration (in tokens) 26 | chunk_overlap=2, # Small overlap (in tokens) 27 | chunking_strategy='token', # Use the model's tokenizer for chunking 28 | expand_before=5, # Add 5 tokens of context before the vectorized chunk 29 | expand_after=5, # Add 5 tokens of context after the vectorized chunk 30 | text_cleaner='basic' # Use the built-in basic text cleaner 31 | ) 32 | 33 | # --- 2. Add content --- 34 | # The add_text method is now much simpler. It uses the configuration 35 | # provided when the store was created. 36 | print("\n--- Adding content to the store ---") 37 | text_to_add = "The quick brown fox jumps over the lazy dog. This sentence is used to demonstrate all letters of the alphabet. It is a classic pangram." 38 | ss.add_text( 39 | unique_id="pangram_text", 40 | text=text_to_add 41 | ) 42 | print(f"Added text with ID 'pangram_text'.") 43 | 44 | 45 | # --- 3. Query the store --- 46 | # The query method also uses the instance's configured vectorizer automatically. 47 | print("\n--- Querying the store ---") 48 | query = "a speedy fox" 49 | results = ss.query(query) 50 | 51 | print(f"Query: '{query}'") 52 | for r in results: 53 | print("-" * 20) 54 | print(f"Similarity: {r['similarity_percent']:.2f}%") 55 | # The 'chunk_text' returned is the EXPANDED text for better context. 56 | print(f"Stored (expanded) chunk: '{r['chunk_text']}'") 57 | 58 | # --- 4. Vectorize text directly (optional) --- 59 | # This method uses the instance's configured vectorizer. 60 | print("\n--- Vectorizing a new sentence directly ---") 61 | v1 = ss.vectorize_text("Hello there") 62 | print(f"Successfully vectorized a new sentence. Vector dimension: {v1.shape}") 63 | 64 | # The store is automatically closed if used in a 'with' block, 65 | # or you can call ss.close() manually. 66 | ss.close() 67 | print("\n--- Example finished ---") -------------------------------------------------------------------------------- /safe_store/processing/text_cleaning.py: -------------------------------------------------------------------------------- 1 | # safe_store/processing/text_cleaning.py 2 | import re 3 | from typing import Callable, Union 4 | 5 | def basic_text_cleaner(text: str) -> str: 6 | """ 7 | An enhanced text cleaner that performs several common cleanup tasks, designed 8 | to be safe for code and structured text while improving quality for LLMs. 9 | 10 | - Normalizes all line endings to a single newline character (`\n`). 11 | - Removes non-printable ASCII control characters (except tab and newline) that 12 | can break LLM tokenizers. 13 | - Preserves leading whitespace (indentation) on each line, which is crucial for code. 14 | - Replaces repetitive dot sequences (e.g., '....') with a standard ellipsis ('...'). 15 | - Collapses multiple spaces *within* a line into a single space, but leaves indentation untouched. 
16 | - Reduces three or more consecutive newlines down to just two, preserving paragraph 17 | breaks without creating excessive empty space. Single newlines are kept. 18 | 19 | Args: 20 | text: The input string to clean. 21 | 22 | Returns: 23 | The cleaned string. 24 | """ 25 | if not isinstance(text, str): 26 | return "" 27 | 28 | # 1. Normalize line endings to \n. 29 | text = text.replace('\r\n', '\n').replace('\r', '\n') 30 | 31 | # 2. Remove non-printable control characters except for tab, newline. 32 | text = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f]', '', text) 33 | 34 | # 3. Replace long sequences of dots with a standard ellipsis. 35 | text = re.sub(r'\.{4,}', '...', text) 36 | 37 | # 4. Process line by line to preserve indentation while cleaning inline spaces. 38 | lines = text.split('\n') 39 | cleaned_lines = [] 40 | for line in lines: 41 | # Separate leading whitespace (indentation) from the rest of the content 42 | match = re.match(r'^(\s*)', line) 43 | leading_whitespace = match.group(1) if match else "" 44 | content = line[len(leading_whitespace):] 45 | 46 | # Collapse multiple spaces in the content part only 47 | cleaned_content = re.sub(r' {2,}', ' ', content) 48 | 49 | cleaned_lines.append(leading_whitespace + cleaned_content) 50 | 51 | text = '\n'.join(cleaned_lines) 52 | 53 | # 5. Reduce 3 or more newlines to a maximum of two. 54 | text = re.sub(r'\n{3,}', '\n\n', text) 55 | 56 | return text.strip() 57 | 58 | 59 | def get_cleaner(cleaner: Union[str, Callable[[str], str], None]) -> Callable[[str], str]: 60 | """ 61 | Returns a callable cleaner function. 62 | 63 | Args: 64 | cleaner: Can be the name of a predefined cleaner ('basic') or a custom 65 | callable function. If None, returns an identity function that 66 | does nothing. 67 | 68 | Returns: 69 | A callable function that takes a string and returns a string. 70 | """ 71 | if cleaner is None: 72 | return lambda x: x # Identity function 73 | if callable(cleaner): 74 | return cleaner 75 | if isinstance(cleaner, str): 76 | if cleaner == 'basic': 77 | return basic_text_cleaner 78 | else: 79 | raise ValueError(f"Unknown predefined cleaner: '{cleaner}'") 80 | raise TypeError("cleaner must be a string, a callable, or None") -------------------------------------------------------------------------------- /examples/encryption_usage.py: -------------------------------------------------------------------------------- 1 | # examples/encryption_usage.py 2 | """ 3 | Demonstrates using safe_store's encryption feature. 4 | """ 5 | import safe_store 6 | from pathlib import Path 7 | import shutil 8 | import sqlite3 9 | 10 | # --- Configuration --- 11 | DB_FILE = "encrypted_example_store.db" 12 | ENCRYPTION_KEY = "this-is-my-secret-Pa$$wOrd!" 13 | DOC_DIR = Path("temp_docs_encrypted") 14 | VECTORIZER_NAME = "st" 15 | VECTORIZER_CONFIG = {"model": "all-MiniLM-L6-v2"} 16 | 17 | def print_header(title): 18 | print("\n" + "="*10 + f" {title} " + "="*10) 19 | 20 | def cleanup(): 21 | print_header("Cleaning Up") 22 | for p in [DB_FILE, f"{DB_FILE}.lock", f"{DB_FILE}-wal", f"{DB_FILE}-shm"]: 23 | Path(p).unlink(missing_ok=True) 24 | if DOC_DIR.exists(): shutil.rmtree(DOC_DIR) 25 | print("- Cleanup complete.") 26 | 27 | if __name__ == "__main__": 28 | cleanup() 29 | 30 | DOC_DIR.mkdir(exist_ok=True) 31 | doc_path = DOC_DIR / "secret_notes.txt" 32 | doc_path.write_text("Project Phoenix: Launch date is Q4. Key personnel: Alice, Bob.") 33 | 34 | # --- 1. 
Initialize SafeStore WITH Encryption Key and Vectorizer --- 35 | print_header("Initializing Encrypted Store") 36 | store_encrypted = safe_store.SafeStore( 37 | DB_FILE, 38 | vectorizer_name=VECTORIZER_NAME, 39 | vectorizer_config=VECTORIZER_CONFIG, 40 | log_level=safe_store.LogLevel.INFO, 41 | encryption_key=ENCRYPTION_KEY 42 | ) 43 | 44 | # --- 2. Add Document to Encrypted Store --- 45 | print_header("Adding Document (Encrypted)") 46 | with store_encrypted: 47 | store_encrypted.add_document(doc_path, metadata={"sensitivity": "high"}) 48 | print(f"- Added '{doc_path.name}'.") 49 | 50 | # Direct DB check 51 | conn = sqlite3.connect(store_encrypted.db_path) 52 | is_encrypted_flag = conn.execute("SELECT is_encrypted FROM chunks LIMIT 1").fetchone()[0] 53 | conn.close() 54 | if is_encrypted_flag == 1: 55 | print("[VERIFIED] Direct DB check: is_encrypted flag is set.") 56 | else: 57 | print("[WARNING] Direct DB check: is_encrypted flag is NOT set.") 58 | 59 | # --- 3. Query Encrypted Store (With Key) --- 60 | print_header("Querying Encrypted Store (With Key)") 61 | query = "project personnel" 62 | results = store_encrypted.query(query, top_k=1) 63 | if results: 64 | print(f" Text: '{results[0]['chunk_text']}'") 65 | assert "[Encrypted" not in results[0]['chunk_text'] 66 | 67 | # --- 4. Access Encrypted DB WITHOUT the Key --- 68 | print_header("Accessing Encrypted Store WITHOUT Key") 69 | store_no_key = safe_store.SafeStore(DB_FILE, vectorizer_name=VECTORIZER_NAME, vectorizer_config=VECTORIZER_CONFIG) 70 | with store_no_key: 71 | results_no_key = store_no_key.query("security protocol", top_k=1) 72 | if results_no_key: 73 | print(f" Text: '{results_no_key[0]['chunk_text']}'") 74 | assert results_no_key[0]['chunk_text'] == "[Encrypted - Key Unavailable]" 75 | 76 | # --- 5. 
Access Encrypted DB with WRONG Key --- 77 | print_header("Accessing Encrypted Store With WRONG Key") 78 | store_wrong_key = safe_store.SafeStore(DB_FILE, vectorizer_name=VECTORIZER_NAME, vectorizer_config=VECTORIZER_CONFIG, encryption_key="wrong-key") 79 | with store_wrong_key: 80 | results_wrong_key = store_wrong_key.query("launch date", top_k=1) 81 | if results_wrong_key: 82 | print(f" Text: '{results_wrong_key[0]['chunk_text']}'") 83 | assert results_wrong_key[0]['chunk_text'] == "[Encrypted - Decryption Failed]" -------------------------------------------------------------------------------- /safe_store/utils/json_parsing.py: -------------------------------------------------------------------------------- 1 | import re 2 | import json 3 | from ascii_colors import ASCIIColors, trace_exception 4 | 5 | import re 6 | import json 7 | from collections import OrderedDict 8 | 9 | def robust_json_parser(json_string: str) -> dict: 10 | json_string = re.sub(r"^```(?:json)?\s*|\s*```$", '', json_string.strip()) 11 | 12 | try: 13 | return json.loads(json_string) 14 | except json.JSONDecodeError as ex: 15 | err = ex 16 | 17 | json_match = re.search(r'(\{[\s\S]*\}|\[[\s\S]*\])', json_string) 18 | cleaned_string = json_match.group(0) if json_match else json_string 19 | 20 | try: 21 | cleaned_string = re.sub(r'\bTrue\b', 'true', cleaned_string) 22 | cleaned_string = re.sub(r'\bFalse\b', 'false', cleaned_string) 23 | cleaned_string = re.sub(r'\bNone\b', 'null', cleaned_string) 24 | cleaned_string = re.sub(r'\b(undefined|NaN|Infinity|-Infinity)\b', 'null', cleaned_string) 25 | 26 | cleaned_string = re.sub(r'//.*', '', cleaned_string) 27 | cleaned_string = re.sub(r'/\*[\s\S]*?\*/', '', cleaned_string) 28 | 29 | cleaned_string = re.sub(r'\\([_`*#\-])', r'\1', cleaned_string) 30 | cleaned_string = re.sub(r',\s*(\}|\])', r'\1', cleaned_string) 31 | 32 | cleaned_string = re.sub(r'\}\s*\{', '},{', cleaned_string) 33 | 34 | def escape_newlines_in_strings(text: str) -> str: 35 | in_string = False 36 | result = [] 37 | i = 0 38 | while i < len(text): 39 | c = text[i] 40 | if c == '"' and (i == 0 or text[i - 1] != '\\'): 41 | in_string = not in_string 42 | if in_string and c == '\n': 43 | result.append('\\n') 44 | else: 45 | result.append(c) 46 | i += 1 47 | return ''.join(result) 48 | 49 | cleaned_string = escape_newlines_in_strings(cleaned_string) 50 | 51 | def escape_unescaped_inner_quotes(text: str) -> str: 52 | def fix(match): 53 | s = match.group(0) 54 | inner = s[1:-1] 55 | inner_fixed = re.sub(r'(? str: 80 | stack = [] 81 | for c in s: 82 | if c in "{[": 83 | stack.append(c) 84 | elif c in "}]": 85 | if stack and ((stack[-1] == '{' and c == '}') or (stack[-1] == '[' and c == ']')): 86 | stack.pop() 87 | for opener in reversed(stack): 88 | s += '}' if opener == '{' else ']' 89 | return s 90 | 91 | cleaned_string = balance_brackets(cleaned_string) 92 | 93 | return json.loads(cleaned_string, object_pairs_hook=OrderedDict) 94 | 95 | except json.JSONDecodeError as e: 96 | raise ValueError(f"Failed to parse JSON. Final error: {e}") from e 97 | -------------------------------------------------------------------------------- /docs/logging.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Logging 3 | ======= 4 | 5 | ``safe_store`` uses the `ascii_colors `_ library for internal logging, providing clear, leveled, and colorful console output by default. 6 | 7 | Default Behavior 8 | ---------------- 9 | 10 | * Logs are printed directly to the console (stderr). 
11 | * Messages are color-coded based on severity (DEBUG, INFO, SUCCESS, WARNING, ERROR, CRITICAL). 12 | * The default logging level is ``INFO``. This means only messages with severity INFO, SUCCESS, WARNING, ERROR, and CRITICAL will be displayed. DEBUG messages are hidden. 13 | 14 | Changing the Log Level 15 | ---------------------- 16 | 17 | You can easily change the minimum severity level displayed when initializing ``safe_store``: 18 | 19 | .. code-block:: python 20 | 21 | import safe_store 22 | from safe_store import LogLevel # Or from ascii_colors import LogLevel 23 | 24 | # Show only warnings and errors 25 | store_warn = safe_store.SafeStore("my_store_warn.db", log_level=LogLevel.WARNING) 26 | 27 | # Show all messages, including detailed debug info 28 | store_debug = safe_store.SafeStore("my_store_debug.db", log_level=LogLevel.DEBUG) 29 | 30 | Advanced Configuration (Global) 31 | ------------------------------- 32 | 33 | Since ``safe_store`` uses ``ascii_colors``, you can configure logging globally for your entire application *before* initializing ``safe_store``. This allows you to: 34 | 35 | * Log messages to a file. 36 | * Change the output format. 37 | * Use JSON formatting. 38 | * Add multiple handlers (e.g., log DEBUG to file, INFO to console). 39 | * Disable console logging entirely. 40 | 41 | Here's how to configure ``ascii_colors`` globally: 42 | 43 | .. code-block:: python 44 | 45 | import safe_store 46 | from ascii_colors import ASCIIColors, LogLevel, FileHandler, Formatter, JSONFormatter 47 | import logging # Standard logging Formatter can also be used 48 | 49 | # --- Example 1: Set global level and log to file --- 50 | ASCIIColors.set_log_level(LogLevel.DEBUG) # Apply DEBUG level globally 51 | 52 | # Create a file handler 53 | log_file = "app_activity.log" 54 | file_handler = FileHandler(log_file, encoding='utf-8') 55 | 56 | # Set a specific format for the file 57 | file_formatter = Formatter( 58 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s", 59 | datefmt="%Y-%m-%d %H:%M:%S" 60 | ) 61 | file_handler.setFormatter(file_formatter) 62 | 63 | # Add the file handler to ascii_colors 64 | ASCIIColors.add_handler(file_handler) 65 | 66 | # Optional: If you ONLY want file logging, remove the default console handler 67 | # default_console_handler = ASCIIColors.get_default_handler() 68 | # if default_console_handler: 69 | # ASCIIColors.remove_handler(default_console_handler) 70 | 71 | print(f"Logging DEBUG and above to console (default) and {log_file}") 72 | 73 | # Now initialize safe_store - it will respect the global settings 74 | store = safe_store.SafeStore("my_store.db") 75 | # ... use store ... 76 | # safe_store's internal DEBUG messages will now appear in the file 77 | 78 | 79 | # --- Example 2: JSON logging to file --- 80 | # Clear previous handlers if starting fresh configuration 81 | # ASCIIColors.reset() # Or ASCIIColors.clear_handlers() 82 | 83 | # ASCIIColors.set_log_level(LogLevel.INFO) # Set desired level 84 | 85 | # json_handler = FileHandler("app_log.jsonl", encoding='utf-8') 86 | # json_formatter = JSONFormatter() 87 | # json_handler.setFormatter(json_formatter) 88 | # ASCIIColors.add_handler(json_handler) 89 | 90 | # # Optionally remove console handler 91 | # # default_console_handler = ASCIIColors.get_default_handler() 92 | # # if default_console_handler: ASCIIColors.remove_handler(default_console_handler) 93 | 94 | # store_json = safe_store.SafeStore("my_json_store.db") 95 | # ... use store_json ... 
96 | 97 | See the `ascii_colors documentation `_ for more details on handlers and formatters. 98 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "safe-store" 7 | # Bump version for this release 8 | version = "3.3.3" 9 | description = "Simple, concurrent SQLite-based vector store optimized for local RAG pipelines, with optional encryption." 10 | readme = "README.md" 11 | requires-python = ">=3.8" 12 | license = { file = "LICENSE" } 13 | authors = [ 14 | { name = "ParisNeo", email = "parisneo_ai@gmail.com" }, 15 | ] 16 | keywords = ["vector", "database", "sqlite", "rag", "llm", "embedding", "semantic search", "local", "concurrent", "encryption", "webui"] 17 | classifiers = [ 18 | "Development Status :: 5 - Production/Stable", # Or "5 - Production/Stable" if ready 19 | "Programming Language :: Python :: 3", 20 | "Programming Language :: Python :: 3.8", 21 | "Programming Language :: Python :: 3.9", 22 | "Programming Language :: Python :: 3.10", 23 | "Programming Language :: Python :: 3.11", 24 | "Programming Language :: Python :: 3.12", 25 | "License :: OSI Approved :: Apache Software License", 26 | "Operating System :: OS Independent", 27 | "Intended Audience :: Developers", 28 | "Intended Audience :: Science/Research", 29 | "Topic :: Database", 30 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 31 | "Topic :: Software Development :: Libraries :: Python Modules", 32 | "Topic :: Text Processing :: Indexing", 33 | "Topic :: Security :: Cryptography", # Added classifier 34 | ] 35 | # Core dependencies 36 | dependencies = [ 37 | "pipmaster>=1.0.8", 38 | "numpy>=1.21", 39 | "ascii_colors>=0.11.6", # For logging/console output 40 | "filelock>=3.9", # For concurrency control 41 | "sqlalchemy" 42 | ] 43 | 44 | [project.optional-dependencies] 45 | # --- Parsing Libraries --- 46 | parsing = [ 47 | "pypdf>=3.10", 48 | "python-docx>=1.0", 49 | "beautifulsoup4>=4.11", 50 | "lxml>=4.9", # Recommended HTML parser 51 | ] 52 | 53 | # --- Vectorizer Libraries --- 54 | sentence-transformers = ["sentence-transformers==4.1.0"] 55 | tfidf = ["scikit-learn>=1.0"] 56 | # Future vectorizers can be added here: 57 | openai = ["openai>=1.0"] 58 | ollama = ["ollama"] 59 | 60 | # --- Encryption Library --- 61 | encryption = ["cryptography>=40.0"] # Added Encryption 62 | 63 | # --- Combined Extras --- 64 | all-vectorizers = [ 65 | "safe-store[sentence-transformers]", 66 | "safe-store[tfidf]", 67 | "safe-store[ollama]", 68 | "safe-store[openai]", 69 | ] 70 | # Combined extra for everything (parsing, all vectorizers, encryption, webui) 71 | all = [ 72 | "safe-store[parsing]", 73 | "safe-store[all-vectorizers]", 74 | "safe-store[encryption]", 75 | ] 76 | # Extra for development dependencies (testing, linting, building, docs) 77 | dev = [ 78 | "safe-store[all]", # Dev should include all optional features 79 | "pytest>=7.0", 80 | "pytest-cov>=3.0", 81 | "flake8>=5.0", # Or ruff 82 | "black>=22.0", 83 | "mypy>=0.9", 84 | "types-filelock", 85 | "types-cryptography", # Added types for cryptography 86 | # Documentation tools 87 | "Sphinx>=5.0", 88 | "sphinx-rtd-theme>=1.0", 89 | "lxml", # Needed again for docutils used by sphinx if not already installed 90 | # Build tools 91 | "hatchling", 92 | "wheel", 93 | "twine", # For checking/uploading packages 94 | ] 95 | 96 | 
[project.urls] 97 | Homepage = "https://github.com/ParisNeo/safe_store" 98 | Repository = "https://github.com/ParisNeo/safe_store" 99 | Documentation = "https://github.com/ParisNeo/safe_store#readme" # Link to README initially, update later if dedicated docs site exists 100 | Issues = "https://github.com/ParisNeo/safe_store/issues" 101 | 102 | # --- Tool Configurations --- 103 | [tool.black] 104 | line-length = 88 105 | target-version = ['py38'] 106 | 107 | [tool.hatch.version] 108 | path = "safe_store/__init__.py" 109 | 110 | [tool.hatch.build] 111 | include = [ 112 | "safe_store" 113 | ] 114 | -------------------------------------------------------------------------------- /tests/security/test_encryption.py: -------------------------------------------------------------------------------- 1 | # tests/security/test_encryption.py 2 | import pytest 3 | from safe_store.security.encryption import Encryptor, CRYPTOGRAPHY_AVAILABLE 4 | from safe_store.core.exceptions import EncryptionError, ConfigurationError 5 | 6 | # Conditionally skip tests if cryptography is not installed 7 | pytestmark = pytest.mark.skipif(not CRYPTOGRAPHY_AVAILABLE, reason="Requires cryptography library") 8 | 9 | @pytest.fixture 10 | def password() -> str: 11 | return "test-password-123!" 12 | 13 | @pytest.fixture 14 | def encryptor_instance(password: str) -> Encryptor: 15 | return Encryptor(password) 16 | 17 | def test_encryptor_init_no_password(): 18 | encryptor = Encryptor(None) 19 | assert not encryptor.is_enabled 20 | with pytest.raises(EncryptionError, match="Encryption is not enabled"): 21 | encryptor.encrypt("test") 22 | with pytest.raises(EncryptionError, match="Decryption is not enabled"): 23 | encryptor.decrypt(b"somebytes") 24 | 25 | def test_encryptor_init_empty_password(): 26 | with pytest.raises(ValueError, match="non-empty string"): 27 | Encryptor("") 28 | 29 | def test_encryptor_init_with_password(encryptor_instance: Encryptor): 30 | assert encryptor_instance.is_enabled 31 | assert encryptor_instance._fernet is not None 32 | 33 | def test_derive_key_consistency(password: str): 34 | """Ensure the same password yields the same key (due to fixed salt).""" 35 | key1 = Encryptor._derive_key(password) 36 | key2 = Encryptor._derive_key(password) 37 | assert key1 == key2 38 | assert isinstance(key1, bytes) 39 | 40 | def test_derive_key_different_passwords(password: str): 41 | key1 = Encryptor._derive_key(password) 42 | key2 = Encryptor._derive_key(password + "extra") 43 | assert key1 != key2 44 | 45 | def test_encrypt_decrypt_success(encryptor_instance: Encryptor): 46 | original_data = "This is sensitive data." 
47 | encrypted_token = encryptor_instance.encrypt(original_data) 48 | assert isinstance(encrypted_token, bytes) 49 | assert encrypted_token != original_data.encode('utf-8') 50 | 51 | decrypted_data = encryptor_instance.decrypt(encrypted_token) 52 | assert isinstance(decrypted_data, str) 53 | assert decrypted_data == original_data 54 | 55 | def test_encrypt_non_string(encryptor_instance: Encryptor): 56 | with pytest.raises(TypeError, match="must be a string"): 57 | encryptor_instance.encrypt(b"bytes data") # type: ignore 58 | with pytest.raises(TypeError, match="must be a string"): 59 | encryptor_instance.encrypt(123) # type: ignore 60 | 61 | def test_decrypt_non_bytes(encryptor_instance: Encryptor): 62 | with pytest.raises(TypeError, match="must be bytes"): 63 | encryptor_instance.decrypt("string data") # type: ignore 64 | with pytest.raises(TypeError, match="must be bytes"): 65 | encryptor_instance.decrypt(123) # type: ignore 66 | 67 | def test_decrypt_invalid_token(encryptor_instance: Encryptor): 68 | invalid_token = b"not_a_valid_fernet_token" 69 | with pytest.raises(EncryptionError, match="Invalid token"): 70 | encryptor_instance.decrypt(invalid_token) 71 | 72 | def test_decrypt_tampered_token(encryptor_instance: Encryptor): 73 | original_data = "Original message." 74 | encrypted_token = encryptor_instance.encrypt(original_data) 75 | # Tamper slightly (e.g., flip a bit - simplistic tamper) 76 | tampered_token = bytearray(encrypted_token) 77 | tampered_token[-1] = tampered_token[-1] ^ 1 # Flip last bit 78 | tampered_token_bytes = bytes(tampered_token) 79 | 80 | with pytest.raises(EncryptionError, match="Invalid token"): 81 | encryptor_instance.decrypt(tampered_token_bytes) 82 | 83 | def test_decrypt_wrong_key(password: str): 84 | encryptor1 = Encryptor(password) 85 | encryptor2 = Encryptor(password + "_different") 86 | 87 | original_data = "Secret info." 88 | encrypted_token = encryptor1.encrypt(original_data) 89 | 90 | # Attempt decryption with the wrong key 91 | with pytest.raises(EncryptionError, match="Invalid token"): 92 | encryptor2.decrypt(encrypted_token) 93 | -------------------------------------------------------------------------------- /docs/quickstart.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Quick Start 3 | ========== 4 | 5 | Here's a basic example demonstrating indexing and querying: 6 | 7 | .. code-block:: python 8 | 9 | import safe_store 10 | from pathlib import Path 11 | import time # For demonstrating concurrency 12 | 13 | # --- 1. Prepare Sample Documents --- 14 | doc_dir = Path("my_docs") 15 | doc_dir.mkdir(exist_ok=True) 16 | doc1_path = doc_dir / "doc1.txt" 17 | doc1_path.write_text("safe_store makes local vector storage simple and efficient.", encoding='utf-8') 18 | doc2_path = doc_dir / "doc2.html" 19 | doc2_path.write_text("
<html><body><p>HTML content can also be indexed.</p></body></html>
", encoding='utf-8') 20 | 21 | print(f"Created sample files in: {doc_dir.resolve()}") 22 | 23 | # --- 2. Initialize safe_store --- 24 | # Use DEBUG level for more verbose output, adjust lock timeout if needed 25 | # Add encryption_key="your-secret-password" to enable encryption 26 | store = safe_store.SafeStore( 27 | "my_vector_store.db", 28 | log_level=safe_store.LogLevel.DEBUG, 29 | lock_timeout=10 # Wait up to 10s for write lock 30 | # encryption_key="your-secret-password" # Uncomment to enable 31 | ) 32 | 33 | # Best practice: Use safe_store as a context manager 34 | try: 35 | with store: 36 | # --- 3. Add Documents (acquires write lock) --- 37 | print("\n--- Indexing Documents ---") 38 | # Requires safe_store[sentence-transformers] 39 | store.add_document(doc1_path, vectorizer_name="st:all-MiniLM-L6-v2", chunk_size=50, chunk_overlap=10) 40 | 41 | # Requires safe_store[parsing] for HTML 42 | store.add_document(doc2_path, vectorizer_name="st:all-MiniLM-L6-v2") 43 | 44 | # Add TF-IDF vectors as well (requires safe_store[tfidf]) 45 | # This will fit TF-IDF on all documents 46 | print("\n--- Adding TF-IDF Vectorization ---") 47 | store.add_vectorization("tfidf:my_analysis") 48 | 49 | # --- 4. Query (read operation, concurrent with WAL) --- 50 | print("\n--- Querying using Sentence Transformer ---") 51 | query_st = "simple storage" 52 | results_st = store.query(query_st, vectorizer_name="st:all-MiniLM-L6-v2", top_k=2) 53 | for i, res in enumerate(results_st): 54 | print(f"ST Result {i+1}: Score={res['similarity']:.4f}, Path='{Path(res['file_path']).name}', Text='{res['chunk_text'][:60]}...'") 55 | 56 | print("\n--- Querying using TF-IDF ---") 57 | query_tfidf = "html index" 58 | results_tfidf = store.query(query_tfidf, vectorizer_name="tfidf:my_analysis", top_k=2) 59 | for i, res in enumerate(results_tfidf): 60 | print(f"TFIDF Result {i+1}: Score={res['similarity']:.4f}, Path='{Path(res['file_path']).name}', Text='{res['chunk_text'][:60]}...'") 61 | 62 | # --- 5. 
List Methods --- 63 | print("\n--- Listing Vectorization Methods ---") 64 | methods = store.list_vectorization_methods() 65 | for method in methods: 66 | print(f"- ID: {method['method_id']}, Name: {method['method_name']}, Type: {method['method_type']}, Dim: {method['vector_dim']}") 67 | 68 | except safe_store.ConfigurationError as e: 69 | print(f"\n[ERROR] Missing dependency: {e}") 70 | print("Please install the required extras (e.g., pip install safe_store[all])") 71 | except safe_store.ConcurrencyError as e: 72 | print(f"\n[ERROR] Lock timeout or concurrency issue: {e}") 73 | except Exception as e: 74 | print(f"\n[ERROR] An unexpected error occurred: {e}") 75 | finally: 76 | # Connection is closed automatically by the 'with' statement exit 77 | print("\n--- Store context closed ---") 78 | # Cleanup (optional) 79 | # import shutil 80 | # shutil.rmtree(doc_dir) 81 | # Path("my_vector_store.db").unlink(missing_ok=True) 82 | # Path("my_vector_store.db.lock").unlink(missing_ok=True) 83 | 84 | print("\nCheck 'my_vector_store.db' and console logs.") 85 | 86 | -------------------------------------------------------------------------------- /safe_store/vectorization/methods/sentense_transformer/__init__.py: -------------------------------------------------------------------------------- 1 | # safe_store/vectorization/methods/st.py 2 | import numpy as np 3 | from typing import List, Optional, Dict, Any 4 | from safe_store.vectorization.base import BaseVectorizer 5 | from safe_store.core.exceptions import ConfigurationError, VectorizationError 6 | from safe_store.processing.tokenizers import HuggingFaceTokenizerWrapper 7 | from ascii_colors import ASCIIColors, trace_exception 8 | import pipmaster as pm 9 | 10 | class_name="STVectorizer" 11 | 12 | try: 13 | pm.ensure_packages(["torch","torchvision","sentence-transformers"]) 14 | from sentence_transformers import SentenceTransformer 15 | except Exception as e: 16 | trace_exception(e) 17 | SentenceTransformer = None 18 | 19 | 20 | def list_available_models(**kwargs) -> List[str]: 21 | """ 22 | Returns a curated list of popular and effective Sentence Transformer models. 23 | This list is static as querying the Hugging Face Hub dynamically is not practical. 24 | """ 25 | return [ 26 | "all-MiniLM-L6-v2", 27 | "all-mpnet-base-v2", 28 | "multi-qa-mpnet-base-dot-v1", 29 | "all-distilroberta-v1", 30 | "paraphrase-albert-small-v2", 31 | "LaBSE" 32 | ] 33 | 34 | class STVectorizer(BaseVectorizer): 35 | """Vectorizes text using models from the sentence-transformers library.""" 36 | 37 | DEFAULT_MODEL: str = "all-MiniLM-L6-v2" 38 | 39 | def __init__(self, model_config: Dict[str, Any], cache_folder: Optional[str] = None, **kwargs): 40 | super().__init__(vectorizer_name="st") 41 | 42 | if SentenceTransformer is None: 43 | raise ConfigurationError("STVectorizer requires 'sentence-transformers'. Install with: pip install safe_store[sentence-transformers]") 44 | 45 | self.model_name: str = model_config.get("model", self.DEFAULT_MODEL) 46 | if not self.model_name: 47 | raise ConfigurationError("STVectorizer config must include a 'model' key.") 48 | 49 | try: 50 | self.model: SentenceTransformer = SentenceTransformer(self.model_name, cache_folder=cache_folder) 51 | self._dim: int = self.model.get_sentence_embedding_dimension() 52 | self._dtype: np.dtype = np.dtype(np.float32) 53 | ASCIIColors.info(f"Model '{self.model_name}' loaded. 
Dimension: {self._dim}") 54 | except Exception as e: 55 | raise VectorizationError(f"Failed to load Sentence Transformer model '{self.model_name}': {e}") from e 56 | 57 | def get_tokenizer(self) -> Optional[HuggingFaceTokenizerWrapper]: 58 | """Returns the tokenizer from the loaded SentenceTransformer model, wrapped.""" 59 | if hasattr(self.model, 'tokenizer'): 60 | return HuggingFaceTokenizerWrapper(self.model.tokenizer) 61 | return None 62 | 63 | def vectorize(self, texts: List[str]) -> np.ndarray: 64 | if not texts: 65 | return np.empty((0, self.dim), dtype=self.dtype) 66 | try: 67 | embeddings = self.model.encode(texts, convert_to_numpy=True, show_progress_bar=False) 68 | if not isinstance(embeddings, np.ndarray): 69 | raise VectorizationError("SentenceTransformer model did not return a NumPy array.") 70 | if embeddings.dtype != self._dtype: 71 | embeddings = embeddings.astype(self._dtype) 72 | return embeddings 73 | except Exception as e: 74 | raise VectorizationError(f"Error during sentence-transformer encoding: {e}") from e 75 | 76 | @property 77 | def dim(self) -> int: 78 | return self._dim 79 | 80 | @property 81 | def dtype(self) -> np.dtype: 82 | return self._dtype 83 | 84 | @staticmethod 85 | def list_models(**kwargs) -> List[str]: 86 | """ 87 | Returns a list of popular SentenceTransformer models. 88 | This is not an exhaustive list from an API but a curated selection. 89 | """ 90 | return [ 91 | "all-MiniLM-L6-v2", 92 | "all-mpnet-base-v2", 93 | "paraphrase-multilingual-MiniLM-L12-v2", 94 | "distiluse-base-multilingual-cased-v1", 95 | "all-roberta-large-v1" 96 | ] -------------------------------------------------------------------------------- /examples/dynamic_model_selection.py: -------------------------------------------------------------------------------- 1 | # examples/dynamic_model_selection.py 2 | """ 3 | This example demonstrates how to use the `list_available_models` class method 4 | to dynamically discover and select a model from a running Ollama instance, 5 | and then use it to create and query a SafeStore. 6 | """ 7 | import safe_store 8 | from pathlib import Path 9 | import shutil 10 | 11 | # --- Configuration --- 12 | DB_FILE = "dynamic_ollama_store.db" 13 | # This example assumes an Ollama server is running at the default host. 14 | # If your Ollama server is elsewhere, you can specify it: 15 | # OLLAMA_HOST = "http://192.168.1.10:11434" 16 | OLLAMA_HOST = "http://localhost:11434" 17 | 18 | def cleanup(): 19 | """Removes the database file from previous runs.""" 20 | Path(DB_FILE).unlink(missing_ok=True) 21 | Path(f"{DB_FILE}.lock").unlink(missing_ok=True) 22 | print(f"--- Cleaned up old database file: {DB_FILE} ---") 23 | 24 | if __name__ == "__main__": 25 | cleanup() 26 | 27 | # --- 1. 
Discover available Ollama models --- 28 | print(f"\n--- Step 1: Discovering models from Ollama at {OLLAMA_HOST} ---") 29 | try: 30 | # Use the class method to get a list of models from the Ollama server 31 | available_models = safe_store.SafeStore.list_available_models( 32 | vectorizer_name="ollama", 33 | host=OLLAMA_HOST # Pass the host to the method 34 | ) 35 | 36 | if not available_models: 37 | print("\n[ERROR] No models found on the Ollama server.") 38 | print("Please make sure Ollama is running and you have pulled at least one model, for example:") 39 | print(" ollama pull nomic-embed-text") 40 | exit() 41 | 42 | print("Found available models:") 43 | for model in available_models: 44 | print(f" - {model}") 45 | 46 | except safe_store.VectorizationError as e: 47 | print(f"\n[ERROR] Could not connect to the Ollama server: {e}") 48 | print("Please ensure your Ollama server is running and accessible.") 49 | exit() 50 | except Exception as e: 51 | print(f"\n[ERROR] An unexpected error occurred: {e}") 52 | exit() 53 | 54 | # --- 2. Select a model and configure the store --- 55 | print("\n--- Step 2: Selecting a model ---") 56 | # For this example, we'll just pick the first model from the list. 57 | # In a real application, you might let the user choose. 58 | selected_model = available_models[0] 59 | print(f"Selected model: {selected_model}") 60 | 61 | # Prepare the configuration for the SafeStore instance 62 | vectorizer_name = "ollama" 63 | vectorizer_config = { 64 | "model": selected_model, 65 | "host": OLLAMA_HOST 66 | } 67 | 68 | # --- 3. Initialize SafeStore with the selected model --- 69 | print("\n--- Step 3: Initializing SafeStore ---") 70 | store = safe_store.SafeStore( 71 | db_path=DB_FILE, 72 | vectorizer_name=vectorizer_name, 73 | vectorizer_config=vectorizer_config, 74 | log_level=safe_store.LogLevel.INFO 75 | ) 76 | print("SafeStore initialized successfully.") 77 | 78 | # --- 4. Use the store to add and query text --- 79 | print("\n--- Step 4: Adding text and querying ---") 80 | with store: 81 | # Add some sample text 82 | store.add_text( 83 | unique_id="tech-report-01", 84 | text="The new quantum processor shows a 200% performance increase in benchmark tests." 85 | ) 86 | store.add_text( 87 | unique_id="finance-summary-01", 88 | text="Quarterly earnings are up by 15%, driven by the new hardware division." 89 | ) 90 | print("Added two text entries to the store.") 91 | 92 | # Perform a query 93 | query_text = "What were the results of the processor benchmarks?" 94 | print(f"\nQuerying for: '{query_text}'") 95 | results = store.query(query_text, top_k=1) 96 | 97 | if results: 98 | result = results[0] 99 | print(f"Found a relevant chunk with {result['similarity_percent']:.2f}% similarity:") 100 | print(f" -> Text: '{result['chunk_text']}'") 101 | else: 102 | print("No relevant results found for the query.") 103 | 104 | print("\n--- Example Finished ---") -------------------------------------------------------------------------------- /examples/custom_logging.py: -------------------------------------------------------------------------------- 1 | # examples/custom_logging.py 2 | """ 3 | Demonstrates how to configure ascii_colors globally to customize 4 | safe_store's logging output (and any other ascii_colors usage). 
5 | """ 6 | import safe_store 7 | from ascii_colors import ASCIIColors, LogLevel, FileHandler, Formatter 8 | from pathlib import Path 9 | import shutil 10 | 11 | # --- Configuration --- 12 | DB_FILE = "custom_log_store.db" 13 | LOG_FILE = "safe_store_custom.log" 14 | DOC_DIR = Path("temp_docs_custom_log") 15 | 16 | # --- Helper Functions --- 17 | def print_header(title): 18 | print("\n" + "="*10 + f" {title} " + "="*10) 19 | 20 | def cleanup(): 21 | print_header("Cleaning Up") 22 | db_path = Path(DB_FILE) 23 | log_path = Path(LOG_FILE) 24 | lock_path = Path(f"{DB_FILE}.lock") 25 | wal_path = Path(f"{DB_FILE}-wal") 26 | shm_path = Path(f"{DB_FILE}-shm") 27 | 28 | if DOC_DIR.exists(): shutil.rmtree(DOC_DIR) 29 | if db_path.exists(): db_path.unlink() 30 | if log_path.exists(): log_path.unlink() 31 | if lock_path.exists(): lock_path.unlink(missing_ok=True) 32 | if wal_path.exists(): wal_path.unlink(missing_ok=True) 33 | if shm_path.exists(): shm_path.unlink(missing_ok=True) 34 | print("- Cleanup complete.") 35 | 36 | # --- Main Script --- 37 | if __name__ == "__main__": 38 | cleanup() # Start fresh 39 | 40 | print_header("Configuring Global Logging") 41 | 42 | # 1. Set the global minimum log level (e.g., show DEBUG messages) 43 | ASCIIColors.set_log_level(LogLevel.DEBUG) 44 | print(f"- Global log level set to: {LogLevel.DEBUG.name}") 45 | 46 | # 2. Create a file handler to log messages to a file 47 | file_handler = FileHandler(LOG_FILE, encoding='utf-8') 48 | print(f"- Configured file logging to: {LOG_FILE}") 49 | 50 | # 3. Define a format for the file logger 51 | # Example format: Timestamp - Level Name - Message 52 | file_formatter = Formatter( 53 | "%(asctime)s [%(levelname)-8s] %(message)s", 54 | datefmt="%Y-%m-%d %H:%M:%S" 55 | ) 56 | file_handler.setFormatter(file_formatter) 57 | print(f"- Set custom format for file logger.") 58 | 59 | # 4. Add the configured file handler to ascii_colors 60 | ASCIIColors.add_handler(file_handler) 61 | print(f"- Added file handler globally.") 62 | 63 | # Optional: Remove the default console handler if you *only* want file logging 64 | # default_console_handler = ASCIIColors.get_default_handler() 65 | # if default_console_handler: 66 | # ASCIIColors.remove_handler(default_console_handler) 67 | # print("- Removed default console handler.") 68 | # else: 69 | # print("- Default console handler not found or already removed.") 70 | print("- Default console handler remains active (logs will go to console AND file).") 71 | 72 | 73 | # --- Initialize and use safe_store --- 74 | # It will now use the global logging configuration we just set. 
75 | print_header("Initializing and Using safe_store") 76 | print("safe_store actions will now be logged according to the global settings.") 77 | print(f"Check the console output AND the '{LOG_FILE}' file.") 78 | 79 | try: 80 | store = safe_store.SafeStore(DB_FILE) # Uses global log level (DEBUG) 81 | 82 | # Prepare a sample document 83 | DOC_DIR.mkdir(exist_ok=True) 84 | doc_path = DOC_DIR / "logging_test.txt" 85 | doc_path.write_text("This is a test document for custom logging.", encoding='utf-8') 86 | 87 | with store: 88 | # Add the document - DEBUG messages should appear in the log file 89 | store.add_document(doc_path, vectorizer_name="st:all-MiniLM-L6-v2") 90 | 91 | # Perform a query 92 | results = store.query("custom logging test") 93 | print("\n--- Query Results ---") 94 | if results: 95 | print(f"Found {len(results)} result(s).") 96 | else: 97 | print("No results found.") 98 | 99 | except safe_store.ConfigurationError as e: 100 | print(f"\n[ERROR] Missing dependency: {e}") 101 | print("Please install required extras (e.g., pip install safe_store[sentence-transformers])") 102 | except Exception as e: 103 | print(f"\n[ERROR] An unexpected error occurred: {e.__class__.__name__}: {e}") 104 | finally: 105 | print("\n--- End of Script ---") 106 | print(f"Review console output and '{LOG_FILE}' for detailed logs.") 107 | 108 | -------------------------------------------------------------------------------- /safe_store/search/similarity.py: -------------------------------------------------------------------------------- 1 | # safe_store/search/similarity.py 2 | import numpy as np 3 | from ascii_colors import ASCIIColors 4 | from typing import Union 5 | 6 | # Type hint for vectors 7 | VectorInput = Union[np.ndarray, list[float]] # Allow lists as input for query? No, enforce ndarray. 8 | Vector1D = np.ndarray # Shape (D,) 9 | Matrix2D = np.ndarray # Shape (N, D) 10 | 11 | def cosine_similarity(query_vector: Vector1D, vectors: Matrix2D) -> np.ndarray: 12 | """ 13 | Calculates cosine similarity between a single query vector and a matrix of vectors. 14 | 15 | Handles normalization and potential zero vectors gracefully. 16 | 17 | Args: 18 | query_vector: A 1D NumPy array representing the query vector (shape D). 19 | vectors: A 2D NumPy array where each row is a vector to compare against 20 | (shape N, D). Can also handle the case where vectors is 1D 21 | (shape D) representing a single comparison vector, by reshaping it. 22 | 23 | Returns: 24 | A 1D NumPy array of shape (N,) containing the cosine similarity scores, 25 | where each score is between -1.0 and 1.0. 26 | 27 | Raises: 28 | TypeError: If inputs are not NumPy arrays. 29 | ValueError: If input shapes are incompatible (e.g., query is not 1D, 30 | matrix is not 1D or 2D, or dimensions mismatch). 
31 | """ 32 | if not isinstance(query_vector, np.ndarray) or not isinstance(vectors, np.ndarray): 33 | raise TypeError("Input query_vector and vectors must be NumPy arrays.") 34 | 35 | # Validate query_vector shape 36 | if query_vector.ndim != 1: 37 | raise ValueError(f"Query vector must be 1D, but got shape {query_vector.shape}") 38 | 39 | # Validate and potentially reshape vectors matrix 40 | if vectors.ndim == 1: 41 | # Allow comparing query to a single vector passed as 1D array 42 | if query_vector.shape[0] == vectors.shape[0]: 43 | vectors = vectors.reshape(1, -1) # Reshape to (1, D) 44 | ASCIIColors.debug("Reshaped 1D input 'vectors' to 2D for single vector comparison.") 45 | else: 46 | raise ValueError( 47 | f"If 'vectors' is 1D, its dimension ({vectors.shape[0]}) must match " 48 | f"query_vector dimension ({query_vector.shape[0]})" 49 | ) 50 | elif vectors.ndim != 2: 51 | raise ValueError(f"Input 'vectors' must be a 1D or 2D array, but got shape {vectors.shape}") 52 | 53 | # Dimension compatibility check 54 | if query_vector.shape[0] != vectors.shape[1]: 55 | raise ValueError( 56 | f"Query vector dimension ({query_vector.shape[0]}) must match " 57 | f"the dimension of vectors in the matrix ({vectors.shape[1]})" 58 | ) 59 | 60 | num_vectors = vectors.shape[0] 61 | if num_vectors == 0: 62 | ASCIIColors.debug("Input 'vectors' matrix is empty, returning empty similarity array.") 63 | return np.array([], dtype=query_vector.dtype) # Return empty array of appropriate type 64 | 65 | ASCIIColors.debug(f"Calculating cosine similarity: query_shape={query_vector.shape}, matrix_shape={vectors.shape}") 66 | 67 | # Calculate norms, adding epsilon for numerical stability and avoiding zero division 68 | epsilon = np.finfo(query_vector.dtype).eps # Use machine epsilon for the data type 69 | query_norm = np.linalg.norm(query_vector) 70 | vectors_norm = np.linalg.norm(vectors, axis=1) # Norm of each row vector 71 | 72 | # Handle potential zero vectors by replacing norm with epsilon 73 | query_norm_safe = query_norm if query_norm > epsilon else epsilon 74 | vectors_norm_safe = np.where(vectors_norm > epsilon, vectors_norm, epsilon) 75 | 76 | # Normalize vectors 77 | # Using np.divide with 'out' and 'where' could be slightly more robust, but direct division is common 78 | norm_query = query_vector / query_norm_safe 79 | # Use broadcasting for matrix normalization: vectors_norm_safe[:, np.newaxis] ensures (N, 1) shape 80 | norm_vectors = vectors / vectors_norm_safe[:, np.newaxis] 81 | 82 | # Calculate dot product between normalized matrix rows and the normalized query vector 83 | # Result is (N, D) dot (D,) -> (N,) 84 | similarity_scores = np.dot(norm_vectors, norm_query) 85 | 86 | # Clip scores to be strictly within [-1, 1] due to potential floating point inaccuracies 87 | similarity_scores = np.clip(similarity_scores, -1.0, 1.0) 88 | 89 | ASCIIColors.debug(f"Similarity calculation complete. Output shape: {similarity_scores.shape}") 90 | return similarity_scores -------------------------------------------------------------------------------- /docs/encryption.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Encryption 3 | ========== 4 | 5 | ``safe_store`` provides optional encryption at rest for the text content of document chunks stored in the database. This helps protect sensitive information if the database file itself is exposed. 
6 | 7 | How it Works 8 | ------------ 9 | 10 | * **Algorithm:** Uses Fernet symmetric authenticated cryptography from the `cryptography `_ library. Fernet uses AES-128 in CBC mode with PKCS7 padding for encryption and HMAC with SHA256 for authentication. 11 | * **Key Derivation:** When you provide an ``encryption_key`` (password) during ``safe_store`` initialization, a strong 256-bit encryption key suitable for Fernet is derived using PBKDF2 HMAC SHA256. 12 | * **Salt:** For simplicity within ``safe_store``, a **fixed, hardcoded salt** is used during key derivation. This means the same password will always produce the same encryption key. See the Security Considerations below. 13 | * **Encryption Target:** Only the ``chunk_text`` stored in the ``chunks`` table is encrypted. Other data like document paths, metadata, vectorizer parameters, and the vectors themselves are **not** encrypted by this feature. 14 | * **Automatic Handling:** Encryption and decryption are handled automatically during ``add_document`` and ``query`` operations if the ``safe_store`` instance was initialized with the correct ``encryption_key``. 15 | 16 | Enabling Encryption 17 | ------------------- 18 | 19 | 1. **Install Dependency:** Ensure the ``cryptography`` library is installed: 20 | .. code-block:: bash 21 | 22 | pip install safe_store[encryption] 23 | # or 24 | pip install safe_store[all] 25 | 26 | 2. **Provide Key on Init:** Pass your chosen password (key) to the ``encryption_key`` parameter when creating the ``safe_store`` instance: 27 | 28 | .. code-block:: python 29 | 30 | import safe_store 31 | 32 | my_password = "your-very-strong-password-here" # Keep this safe! 33 | 34 | store = safe_store.SafeStore( 35 | "encrypted_store.db", 36 | encryption_key=my_password 37 | ) 38 | 39 | # Now, when you add documents, chunk text will be encrypted 40 | with store: 41 | store.add_document("path/to/sensitive_doc.txt") 42 | 43 | # When you query, chunk text will be automatically decrypted 44 | results = store.query("search term") 45 | print(results[0]['chunk_text']) # Prints decrypted text 46 | 47 | Usage Notes 48 | ----------- 49 | 50 | * **Consistency:** You **must** use the exact same ``encryption_key`` every time you open a specific database file that contains encrypted data. 51 | * **Querying without Key:** If you open an encrypted database without providing the key (or with the wrong key), query results will contain placeholder text like ``[Encrypted - Key Unavailable]`` or ``[Encrypted - Decryption Failed]`` instead of the actual chunk text. 52 | * **Adding Vectorizations:** If you use ``add_vectorization`` for a method like TF-IDF that requires fitting on existing text, ``safe_store`` will attempt to decrypt the necessary chunks using the provided key. If the key is missing or incorrect, the operation will fail. 53 | * **Key Management:** **You are solely responsible for managing your ``encryption_key`` securely.** If you lose the key, the encrypted data in the database will be permanently unrecoverable. Do not hardcode keys directly in your source code in production environments. Consider using environment variables, configuration files with appropriate permissions, or dedicated secrets management systems. 54 | 55 | Security Considerations 56 | ----------------------- 57 | 58 | * **Fixed Salt:** As mentioned, ``safe_store`` currently uses a fixed salt for PBKDF2 key derivation for simplicity. 
This is less secure than using a unique, randomly generated salt for each password/database, as it doesn't fully protect against precomputed rainbow table attacks if the fixed salt becomes known. For high-security requirements, this implementation might not be sufficient. 59 | * **Metadata Not Encrypted:** Document paths, metadata, and vector information remain unencrypted. Ensure no sensitive information is placed in document metadata if the database file requires protection. 60 | * **Scope:** Encryption only applies to chunk text *at rest* in the SQLite file. Data is decrypted in memory during processing (e.g., querying). 61 | 62 | This feature provides a reasonable layer of protection against casual inspection of the database file but relies heavily on the security of your chosen ``encryption_key`` and understanding its limitations. 63 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 
107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 168 | #.idea/ 169 | 170 | # Ruff stuff: 171 | .ruff_cache/ 172 | 173 | # PyPI configuration file 174 | .pypirc 175 | config.toml 176 | 177 | .vscode 178 | config.toml 179 | temp_uploaded_files_webui 180 | webui_safestore_docs 181 | temp_docs_basic 182 | temp_docs_graph_example 183 | 184 | *.db 185 | *.db-shm 186 | *.db-wal 187 | *.lock 188 | 189 | 190 | benchmark_output 191 | # Lollms vscoder 192 | .lollms -------------------------------------------------------------------------------- /tests/test_store_phase4.py: -------------------------------------------------------------------------------- 1 | # tests/test_store_phase4.py 2 | import pytest 3 | import sqlite3 4 | from pathlib import Path 5 | from unittest.mock import patch, MagicMock, call 6 | import re 7 | 8 | # Import exceptions and modules 9 | from safe_store import store as safe_store_store_module 10 | from safe_store import SafeStore, LogLevel 11 | from safe_store.core import db 12 | from safe_store.core.exceptions import ConcurrencyError, FileHandlingError, ConfigurationError, SafeStoreError 13 | 14 | # Import filelock components 15 | from filelock import Timeout, FileLock 16 | 17 | # --- FIX: Define availability check locally --- 18 | # Check for parsing libraries availability within this module 19 | try: 20 | import pypdf 21 | import docx 22 | import bs4 23 | import lxml 24 | PARSING_LIBS_AVAILABLE = True 25 | except ImportError: 26 | PARSING_LIBS_AVAILABLE = False 27 | 28 | # Helper for log checks 29 | def assert_log_call_containing(mock_logger, expected_substring): 30 | """Checks if any call to the mock logger contained the substring.""" 31 | found = False 32 | # Check positional args 33 | for call_args in mock_logger.call_args_list: 34 | args, kwargs = call_args 35 | if args and isinstance(args[0], str) and expected_substring in args[0]: 36 | found = True 37 | break 38 | 
# Check method calls if not found in direct calls (needed for specific methods like info, debug) 39 | if not found: 40 | for method_call in mock_logger.method_calls: 41 | call_name, args, kwargs = method_call 42 | if args and isinstance(args[0], str) and expected_substring in args[0]: 43 | found = True 44 | break 45 | assert found, f"Expected log call containing '{expected_substring}' not found in {mock_logger.call_args_list} or {mock_logger.method_calls}" 46 | 47 | # +++ FIX: Added mock_db_colors to decorator and arguments +++ 48 | @patch('safe_store.core.db.ASCIIColors') 49 | @patch('safe_store.store.ASCIIColors') 50 | @patch('safe_store.vectorization.manager.ASCIIColors') 51 | def test_store_close_and_context_manager(mock_manager_colors, mock_store_colors, mock_db_colors, temp_db_path): 52 | """Test explicit close() and context manager usage.""" 53 | # mock_db_colors is now available 54 | store = SafeStore(temp_db_path, log_level=LogLevel.DEBUG) 55 | assert store.conn is not None 56 | assert not store._is_closed 57 | # Check initial connection log 58 | assert_log_call_containing(mock_db_colors.debug, "Connected to database:") 59 | 60 | # --- Ensure cache is populated before first close --- 61 | try: 62 | with store: 63 | _ = store.vectorizer_manager.get_vectorizer(store.DEFAULT_VECTORIZER, store.conn, None) 64 | except Exception as e: 65 | if not isinstance(e, ConfigurationError): 66 | print(f"Warning: Error populating cache in test: {e}") 67 | pass 68 | 69 | store.close() 70 | assert store.conn is None 71 | assert store._is_closed 72 | assert_log_call_containing(mock_store_colors.info, "safe_store connection closed.") 73 | try: 74 | assert_log_call_containing(mock_manager_colors.debug, "Cleared vectorizer manager cache") 75 | except AssertionError: 76 | print("Cache clear log not found, cache might have been empty.") 77 | 78 | 79 | # Test context manager re-opening and closing 80 | mock_store_colors.reset_mock() 81 | mock_manager_colors.reset_mock() 82 | mock_db_colors.reset_mock() # Reset this mock too 83 | with SafeStore(temp_db_path, log_level=LogLevel.DEBUG) as store_ctx: 84 | assert store_ctx.conn is not None 85 | assert not store_ctx._is_closed 86 | # +++ FIX: Check the correct mock for the connection log +++ 87 | assert_log_call_containing(mock_db_colors.debug, "Connected to database:") 88 | 89 | # --- Ensure cache is populated before second close (via exit) --- 90 | try: 91 | _ = store_ctx.vectorizer_manager.get_vectorizer(store.DEFAULT_VECTORIZER, store_ctx.conn, None) 92 | except Exception as e: 93 | if not isinstance(e, ConfigurationError): 94 | print(f"Warning: Error populating cache in test context: {e}") 95 | pass 96 | 97 | # Check logs after exiting context 98 | assert store_ctx.conn is None 99 | assert store_ctx._is_closed 100 | assert_log_call_containing(mock_store_colors.debug, "safe_store context closed cleanly.") 101 | try: 102 | assert_log_call_containing(mock_manager_colors.debug, "Cleared vectorizer manager cache") 103 | except AssertionError: 104 | print("Cache clear log not found after context exit, cache might have been empty.") 105 | 106 | 107 | def test_list_documents_empty(safe_store_instance: SafeStore): 108 | """Test listing documents from an empty store.""" 109 | with safe_store_instance as store: 110 | docs = store.list_documents() 111 | assert docs == [] 112 | 113 | def test_list_vectorization_methods_empty(safe_store_instance: SafeStore): 114 | """Test listing methods from an empty store.""" 115 | with safe_store_instance as store: 116 | methods = 
store.list_vectorization_methods() 117 | assert methods == [] 118 | 119 | 120 | def test_list_documents_populated(populated_store: SafeStore): 121 | """Test listing documents after adding some.""" 122 | with populated_store as store: 123 | docs = store.list_documents() 124 | 125 | assert len(docs) == 2 126 | doc1_info = next((d for d in docs if "sample.txt" in d["file_path"]), None) 127 | doc2_info = next((d for d in docs if "sample2.txt" in d["file_path"]), None) 128 | assert doc1_info is not None; assert doc2_info is not None 129 | assert doc1_info["doc_id"] is not None; assert isinstance(doc1_info["file_path"], str) 130 | assert doc1_info["file_hash"] is not None; assert doc1_info["added_timestamp"] is not None 131 | assert doc1_info["metadata"] is None 132 | 133 | 134 | def test_list_vectorization_methods_populated(populated_store: SafeStore): 135 | """Test listing methods after adding documents.""" 136 | with populated_store as store: 137 | methods = store.list_vectorization_methods() 138 | 139 | st_method = next((m for m in methods if m["method_name"] == store.DEFAULT_VECTORIZER), None) 140 | assert st_method is not None, f"Default vectorizer {store.DEFAULT_VECTORIZER} not found in listed methods." 141 | 142 | assert st_method["method_type"] == "sentence_transformer" 143 | assert st_method["vector_dim"] == 384 # Mocked or real dimension 144 | assert st_method["vector_dtype"] == "float32" 145 | assert st_method["params"] == {}, f"Expected params to be {{}}, got {st_method['params']}" 146 | assert len(methods) == 1, f"Expected 1 method, found {len(methods)}" -------------------------------------------------------------------------------- /safe_store/vectorization/manager.py: -------------------------------------------------------------------------------- 1 | # safe_store/vectorization/manager.py 2 | import json 3 | import yaml 4 | import pipmaster as pm 5 | from typing import Tuple, Optional, Dict, Any, List 6 | from pathlib import Path 7 | 8 | from ..core.exceptions import ConfigurationError, VectorizationError 9 | from .base import BaseVectorizer 10 | from ascii_colors import ASCIIColors 11 | from .utils import load_vectorizer_module 12 | 13 | class VectorizationManager: 14 | """ 15 | Manages and creates vectorizer instances from built-in or custom locations. 16 | Also provides methods to discover available vectorizers and their configurations. 
17 | """ 18 | 19 | def __init__(self, cache_folder: Optional[str] = None, custom_vectorizers_path: Optional[str] = None): 20 | pm.ensure_packages(["PyYAML"]) 21 | self.cache_folder = Path(cache_folder) if cache_folder else None 22 | if self.cache_folder: 23 | self.cache_folder.mkdir(parents=True, exist_ok=True) 24 | 25 | self.custom_vectorizers_path = custom_vectorizers_path 26 | self._cache: Dict[str, BaseVectorizer] = {} 27 | 28 | @staticmethod 29 | def _create_unique_name(vectorizer_name: str, config: Optional[Dict[str, Any]]) -> str: 30 | if not config: 31 | return vectorizer_name 32 | config_str = json.dumps(config, sort_keys=True, separators=(',', ':')) 33 | return f"{vectorizer_name}:{config_str}" 34 | 35 | 36 | @staticmethod 37 | def _create_vectorizer_ascii_infos(vectorizer_name: str, config: Optional[Dict[str, Any]]) -> str: 38 | lines = [] 39 | 40 | lines.append(ASCIIColors.bold(ASCIIColors.magenta("════════════════════════════════════════"), emit=False)) 41 | lines.append(ASCIIColors.bold(ASCIIColors.magenta(" VECTORISER INFORMATION"), emit=False)) 42 | lines.append(ASCIIColors.bold(ASCIIColors.magenta("════════════════════════════════════════", emit=False), emit=False)) 43 | lines.append("") 44 | 45 | lines.append( 46 | f"{ASCIIColors.cyan('Name')} : " 47 | f"{ASCIIColors.bold(ASCIIColors.green(vectorizer_name, emit=False),emit=False)}" 48 | ) 49 | 50 | if config: 51 | lines.append("") 52 | lines.append(ASCIIColors.yellow("Configuration:")) 53 | lines.append(ASCIIColors.orange("──────────────",emit=False)) 54 | 55 | pretty_config = json.dumps(config, indent=2, sort_keys=True) 56 | for line in pretty_config.splitlines(): 57 | lines.append(ASCIIColors.blue(line,emit=False)) 58 | else: 59 | lines.append("") 60 | lines.append(ASCIIColors.red("No configuration provided.",emit=False)) 61 | 62 | lines.append("") 63 | lines.append(ASCIIColors.bold(ASCIIColors.magenta("════════════════════════════════════════",emit=False),emit=False)) 64 | 65 | return "\n".join(lines) 66 | 67 | def list_vectorizers(self) -> List[Dict[str, Any]]: 68 | """Scans for available vectorizers and returns their metadata from description.yaml.""" 69 | vectorizers = [] 70 | 71 | # Scan built-in methods directory 72 | methods_path = Path(__file__).parent / "methods" 73 | for p in methods_path.iterdir(): 74 | if p.is_dir() and (p / "description.yaml").exists(): 75 | with open(p / "description.yaml", 'r', encoding='utf-8') as f: 76 | try: 77 | data = yaml.safe_load(f) 78 | data['name'] = p.name # Add the folder name as the identifier 79 | vectorizers.append(data) 80 | except yaml.YAMLError: 81 | ASCIIColors.warning(f"Could not parse description.yaml for vectorizer '{p.name}'") 82 | 83 | # Scan custom path if provided 84 | if self.custom_vectorizers_path: 85 | custom_path = Path(self.custom_vectorizers_path) 86 | if custom_path.is_dir(): 87 | for p in custom_path.iterdir(): 88 | if p.is_dir() and (p / "description.yaml").exists(): 89 | with open(p / "description.yaml", 'r', encoding='utf-8') as f: 90 | try: 91 | data = yaml.safe_load(f) 92 | data['name'] = p.name 93 | data['is_custom'] = True 94 | vectorizers.append(data) 95 | except yaml.YAMLError: 96 | ASCIIColors.warning(f"Could not parse description.yaml for custom vectorizer '{p.name}'") 97 | 98 | return vectorizers 99 | 100 | def get_vectorizer( 101 | self, 102 | vectorizer_name: str, 103 | vectorizer_config: Optional[Dict[str, Any]], 104 | ) -> BaseVectorizer: 105 | # Fix: Add an alias for 'st' to point to the correct folder. 
106 | # Note: The folder 'sentense_transformer' has a typo and should ideally be 'sentence_transformer'. 107 | if vectorizer_name == "st": 108 | vectorizer_name = "sentense_transformer" 109 | 110 | unique_name = self._create_unique_name(vectorizer_name, vectorizer_config) 111 | 112 | if unique_name in self._cache: 113 | return self._cache[unique_name] 114 | 115 | ASCIIColors.info(f"Initializing vectorizer:\n{self._create_vectorizer_ascii_infos(vectorizer_name, vectorizer_config)}") 116 | config_for_init = vectorizer_config or {} 117 | 118 | try: 119 | module = load_vectorizer_module(vectorizer_name, self.custom_vectorizers_path) 120 | 121 | # The class name is now fetched from the module itself 122 | if not hasattr(module, 'class_name'): 123 | raise ConfigurationError(f"Vectorizer module '{vectorizer_name}' does not define a 'class_name' variable.") 124 | 125 | VectorizerClass = getattr(module, module.class_name) 126 | 127 | if not issubclass(VectorizerClass, BaseVectorizer): 128 | raise ConfigurationError(f"Class '{module.class_name}' does not inherit from BaseVectorizer.") 129 | 130 | vectorizer_instance = VectorizerClass(model_config=config_for_init, cache_folder=self.cache_folder) 131 | 132 | except (ImportError, FileNotFoundError) as e: 133 | raise ConfigurationError(f"Could not find or load vectorizer module for '{vectorizer_name}'.") from e 134 | except Exception as e: 135 | raise VectorizationError(f"Failed to initialize '{vectorizer_name}' vectorizer: {e}") from e 136 | 137 | self._cache[unique_name] = vectorizer_instance 138 | return vectorizer_instance 139 | 140 | def clear_cache(self) -> None: 141 | self._cache.clear() -------------------------------------------------------------------------------- /safe_store/vectorization/methods/ollama/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import List, Optional, Dict, Any 3 | from ...base import BaseVectorizer 4 | from ....core.exceptions import ConfigurationError, VectorizationError 5 | from ascii_colors import ASCIIColors, trace_exception 6 | 7 | # each vectorizer must have a class name variable to be identified 8 | class_name="OllamaVectorizer" 9 | 10 | # Attempt import of ollama and related error types, handle gracefully 11 | try: 12 | import pipmaster as pm 13 | pm.ensure_packages(["ollama"]) 14 | import ollama 15 | _OllamaResponseError = ollama.ResponseError 16 | _OllamaRequestError = ollama.RequestError 17 | _OLLAMA_AVAILABLE = True 18 | except ImportError: 19 | _OLLAMA_AVAILABLE = False 20 | ollama = None 21 | class _OllamaResponseError(Exception): pass 22 | class _OllamaRequestError(Exception): pass 23 | 24 | def list_available_models(**kwargs) -> List[str]: 25 | """Dynamically lists models from a running Ollama server.""" 26 | if not _OLLAMA_AVAILABLE: 27 | raise ConfigurationError("Ollama support is not installed. Please run: pip install safe_store[ollama]") 28 | 29 | try: 30 | response = ollama.list() 31 | # The structure from ollama.list() is a dict with a 'models' key 32 | # which is a list of dicts, each with a 'name' key. 33 | return [model.model for model in response.models] 34 | except ollama.RequestError as e: 35 | raise VectorizationError("Could not connect to the Ollama server. 
Please ensure it is running.") from e 36 | except Exception as e: 37 | raise VectorizationError(f"An unexpected error occurred while listing Ollama models: {e}") from e 38 | 39 | class OllamaVectorizer(BaseVectorizer): 40 | """ 41 | Vectorizes text using models hosted by an Ollama instance. 42 | Requires the `ollama` Python library to be installed. 43 | """ 44 | 45 | def __init__(self, 46 | model_config: Dict[str, Any], 47 | **kwargs): 48 | """ 49 | Initializes the OllamaVectorizer. 50 | 51 | Args: 52 | model_config: A dictionary containing the vectorizer's configuration. 53 | - "model" (str): Mandatory. The name of the model to use. 54 | - "host" (str): Optional. The URL of the Ollama server. 55 | Defaults to http://localhost:11434 or OLLAMA_HOST env var. 56 | 57 | Raises: 58 | ConfigurationError: If 'ollama' is not installed or config is invalid. 59 | VectorizationError: If connection to Ollama fails or the model is invalid. 60 | """ 61 | super().__init__(vectorizer_name="ollama") 62 | if not _OLLAMA_AVAILABLE or ollama is None: 63 | raise ConfigurationError("OllamaVectorizer requires the 'ollama' library. Install with: pip install safe_store[ollama]") 64 | 65 | if not isinstance(model_config, dict) or "model" not in model_config: 66 | raise ConfigurationError("Ollama vectorizer config must be a dictionary with a 'model' key.") 67 | 68 | self.model_name: str = model_config["model"] 69 | self.host: Optional[str] = model_config.get("host") # Let the client handle default 70 | 71 | ASCIIColors.info(f"Initializing Ollama client. Model: {self.model_name}, Host: {self.host or 'default'}") 72 | try: 73 | self.client: ollama.Client = ollama.Client(host=self.host.strip() if self.host else None) # Guard against a missing host before stripping 74 | 75 | # Test connection and get embedding dimension 76 | test_prompt = "hello world" 77 | response = self.client.embeddings(model=self.model_name, prompt=test_prompt) 78 | 79 | embedding = response.get("embedding") 80 | 81 | if not isinstance(embedding, list) or not embedding: 82 | raise VectorizationError(f"Ollama model '{self.model_name}' did not return a valid embedding.") 83 | 84 | self._dim = len(embedding) 85 | if self._dim == 0: 86 | raise VectorizationError(f"Ollama model '{self.model_name}' returned a zero-dimension embedding.") 87 | 88 | self._dtype = np.dtype(np.float32) 89 | ASCIIColors.info(f"Ollama model '{self.model_name}' ready. 
Dimension: {self._dim}") 90 | 91 | except _OllamaResponseError as e: 92 | trace_exception(e) 93 | raise VectorizationError(f"Ollama API error for model '{self.model_name}': {e.error}") from e 94 | except _OllamaRequestError as e: 95 | trace_exception(e) 96 | raise VectorizationError(f"Ollama request error connecting to host '{self.host or 'default'}': {e}") from e 97 | except Exception as e: 98 | trace_exception(e) 99 | raise VectorizationError(f"Failed to initialize Ollama vectorizer '{self.model_name}': {e}") from e 100 | 101 | def vectorize(self, texts: List[str]) -> np.ndarray: 102 | if not texts: 103 | return np.empty((0, self.dim), dtype=self.dtype) 104 | 105 | embeddings_list = [] 106 | try: 107 | for i, text in enumerate(texts): 108 | if not text.strip(): 109 | embeddings_list.append(np.zeros(self.dim, dtype=self._dtype)) 110 | continue 111 | 112 | response = self.client.embeddings(model=self.model_name, prompt=text) 113 | embedding_vector = response.get("embedding") 114 | 115 | if not isinstance(embedding_vector, list) or len(embedding_vector) != self.dim: 116 | raise VectorizationError(f"Ollama model '{self.model_name}' returned an invalid embedding for text at index {i}.") 117 | embeddings_list.append(embedding_vector) 118 | 119 | embeddings_array = np.array(embeddings_list, dtype=self._dtype) 120 | 121 | if embeddings_array.shape != (len(texts), self.dim): 122 | raise VectorizationError(f"Ollama vectorization resulted in unexpected shape {embeddings_array.shape}.") 123 | 124 | return embeddings_array 125 | 126 | except (_OllamaResponseError, _OllamaRequestError) as e: 127 | raise VectorizationError(f"Ollama API error during vectorization: {e}") from e 128 | except Exception as e: 129 | raise VectorizationError(f"Unexpected error during Ollama vectorization: {e}") from e 130 | 131 | @property 132 | def dim(self) -> int: 133 | return self._dim 134 | 135 | @property 136 | def dtype(self) -> np.dtype: 137 | return self._dtype 138 | 139 | @staticmethod 140 | def list_models(**kwargs) -> List[str]: 141 | """Lists available models from the running Ollama instance.""" 142 | try: 143 | response = ollama.list() 144 | # The structure from ollama.list() is a dict with a 'models' key 145 | # which is a list of dicts, each with a 'name' key. 146 | return [model.model for model in response.models] 147 | except ollama.RequestError as e: 148 | raise VectorizationError("Could not connect to the Ollama server. 
Please ensure it is running.") from e 149 | except Exception as e: 150 | raise VectorizationError(f"An unexpected error occurred while listing Ollama models: {e}") from e 151 | -------------------------------------------------------------------------------- /examples/metadata_generation.py: -------------------------------------------------------------------------------- 1 | # examples/metadata_generation.py 2 | import safe_store 3 | from safe_store import LogLevel 4 | import pipmaster as pm 5 | from pathlib import Path 6 | import shutil 7 | import json 8 | 9 | # --- Configuration --- 10 | DB_FILE = "metadata_example.db" 11 | DOC_DIR = Path("temp_docs_metadata_example") 12 | ENCRYPTION_KEY = "my-super-secret-key-for-testing" # Use a strong key in production 13 | 14 | 15 | BINDING_NAME = "ollama" 16 | HOST_ADDRESS = "http://localhost:11434" 17 | MODEL_NAME = "mistral:latest" 18 | 19 | # --- Example Setup --- 20 | def setup(): 21 | """Cleans up old files and creates new ones for the example.""" 22 | print_header("Setting Up Example Environment") 23 | # Clean up DB 24 | db_path = Path(DB_FILE) 25 | paths_to_delete = [db_path, Path(f"{db_path}.lock")] 26 | for p in paths_to_delete: 27 | p.unlink(missing_ok=True) 28 | 29 | # Clean up and create doc directory 30 | if DOC_DIR.exists(): 31 | shutil.rmtree(DOC_DIR) 32 | DOC_DIR.mkdir(exist_ok=True) 33 | 34 | # Create a sample document 35 | article_content = """ 36 | The Art of Quantum Computing: A Gentle Introduction 37 | 38 | Quantum computing represents a fundamental shift in computation. Unlike classical 39 | computers that use bits (0s and 1s), quantum computers use qubits, which can 40 | exist in a superposition of both 0 and 1 simultaneously. This property, along 41 | with entanglement, allows quantum computers to explore a vast number of 42 | possibilities at once, promising to solve complex problems in fields like 43 | medicine, materials science, and cryptography that are intractable for even the 44 | most powerful classical supercomputers. However, building and controlling stable 45 | qubits remains a significant engineering challenge due to their sensitivity to 46 | environmental noise. 47 | """ 48 | (DOC_DIR / "quantum_intro.txt").write_text(article_content.strip()) 49 | print(f"- Created sample document in: {DOC_DIR.resolve()}") 50 | return DOC_DIR / "quantum_intro.txt" 51 | 52 | def print_header(title: str): 53 | print("\n" + "="*20 + f" {title} " + "="*20) 54 | 55 | # --- Metadata Generation with Lollms --- 56 | def generate_metadata_with_lollms(file_content: str) -> dict: 57 | """ 58 | Uses lollms-client to generate a title and summary for the given text. 59 | """ 60 | print_header("Generating Metadata with LOLLMS") 61 | try: 62 | pm.ensure_packages(["lollms_client"]) 63 | from lollms_client import LollmsClient 64 | # Make sure you have a lollms-webui instance running with a model loaded. 65 | # This example assumes a local instance at the default port. 66 | client = LollmsClient(llm_binding_name=BINDING_NAME, llm_binding_config={"host_address": HOST_ADDRESS, "model_name": MODEL_NAME}) 67 | except Exception as e: 68 | print(f" [SKIP] Could not initialize LollmsClient. Is it installed and running? Error: {e}") 69 | return {"error": "LollmsClient not available"} 70 | 71 | prompt = f""" 72 | Analyze the following document and extract a concise title and a one-sentence summary. 73 | Your response MUST be in a raw JSON format with "title" and "summary" as keys. 
74 | 75 | Document: 76 | --- 77 | {file_content} 78 | --- 79 | 80 | JSON Response: 81 | """ 82 | 83 | print(" - Sending prompt to LLM for metadata extraction...") 84 | try: 85 | response = client.generate_text(prompt, max_new_tokens=150) 86 | print(" - Received response from LLM.") 87 | # The response should be a JSON string, let's parse it 88 | metadata = json.loads(response) 89 | print(f" - Successfully parsed metadata: {metadata}") 90 | return metadata 91 | except Exception as e: 92 | print(f" [ERROR] Failed to generate or parse metadata from LLM. Error: {e}") 93 | return {"error": f"LLM metadata generation failed: {e}"} 94 | 95 | # --- Main Script --- 96 | if __name__ == "__main__": 97 | sample_doc_path = setup() 98 | 99 | # 1. Generate Metadata 100 | document_text = sample_doc_path.read_text() 101 | generated_metadata = generate_metadata_with_lollms(document_text) 102 | 103 | if "error" in generated_metadata: 104 | print("\n Proceeding with fallback metadata.") 105 | generated_metadata = { 106 | "title": "Fallback Title: Quantum Computing", 107 | "summary": "A fallback summary about qubits and their challenges." 108 | } 109 | 110 | # 2. Initialize SafeStore with Encryption 111 | print_header("Initializing SafeStore with Encryption") 112 | try: 113 | # Note: We are now passing the encryption key 114 | store = safe_store.SafeStore( 115 | db_path=DB_FILE, 116 | vectorizer_name="st", 117 | vectorizer_config={"model": "all-MiniLM-L6-v2"}, 118 | log_level=LogLevel.INFO, 119 | encryption_key=ENCRYPTION_KEY 120 | ) 121 | print(" - SafeStore initialized.") 122 | except Exception as e: 123 | print(f" [FATAL] Could not initialize SafeStore: {e}") 124 | exit(1) 125 | 126 | # 3. Add the document WITH the generated metadata 127 | print_header("Adding Document with Generated Metadata") 128 | with store: 129 | store.add_document( 130 | file_path=sample_doc_path, 131 | metadata=generated_metadata 132 | ) 133 | print(f" - Document '{sample_doc_path.name}' added to the store.") 134 | 135 | # 4. List documents to verify metadata storage (and encryption) 136 | print("\n --- Verifying Stored Documents ---") 137 | docs = store.list_documents() 138 | for doc in docs: 139 | print(f" - Found Doc ID: {doc['doc_id']}, Path: {doc['file_path']}") 140 | print(f" Metadata: {doc['metadata']}") 141 | 142 | # 5. Query the store and inspect the results 143 | print_header("Querying the Store") 144 | query = "What are the difficulties of building qubits?" 145 | print(f" - Query: '{query}'") 146 | 147 | with store: 148 | results = store.query(query, top_k=1) 149 | 150 | if not results: 151 | print(" - No results found.") 152 | else: 153 | top_result = results[0] 154 | print("\n --- Top Query Result ---") 155 | print(f" - Similarity: {top_result['similarity_percent']:.2f}%") 156 | 157 | print("\n - Document Metadata (from result object):") 158 | print(f" {top_result['document_metadata']}") 159 | 160 | print("\n - Full Chunk Text (with prepended context):") 161 | print("-" * 50) 162 | print(top_result['chunk_text']) 163 | print("-" * 50) 164 | 165 | # Verification 166 | assert "Document Context" in top_result['chunk_text'] 167 | assert generated_metadata['title'] in top_result['chunk_text'] 168 | print("\n [SUCCESS] Verified that document metadata was prepended to the chunk text.") 169 | 170 | # 6. 
Final cleanup 171 | print_header("Final Cleanup") 172 | if DOC_DIR.exists(): 173 | shutil.rmtree(DOC_DIR) 174 | print(f"- Removed temporary directory: {DOC_DIR}") 175 | 176 | print("\n--- Example Finished ---") -------------------------------------------------------------------------------- /safe_store/security/encryption.py: -------------------------------------------------------------------------------- 1 | # safe_store/security/encryption.py 2 | import base64 3 | from typing import Optional, Tuple 4 | 5 | from ascii_colors import ASCIIColors 6 | from ..core.exceptions import EncryptionError, ConfigurationError 7 | 8 | # Attempt import 9 | try: 10 | from cryptography.fernet import Fernet, InvalidToken 11 | from cryptography.hazmat.primitives import hashes 12 | from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC 13 | CRYPTOGRAPHY_AVAILABLE = True 14 | except ImportError: 15 | CRYPTOGRAPHY_AVAILABLE = False 16 | Fernet, InvalidToken = None, None 17 | PBKDF2HMAC, hashes = None, None 18 | 19 | 20 | SALT_SIZE = 16 # Standard salt size for PBKDF2 21 | # Recommended PBKDF2 iterations (adjust based on security needs vs performance) 22 | # OWASP recommendation as of 2023 is >= 600,000 for SHA256 23 | PBKDF2_ITERATIONS = 600_000 24 | 25 | class Encryptor: 26 | """ 27 | Handles symmetric encryption and decryption using Fernet (AES-128-CBC + HMAC). 28 | 29 | Derives a valid Fernet key from a user-provided password using PBKDF2. 30 | """ 31 | 32 | def __init__(self, password: Optional[str]): 33 | """ 34 | Initializes the Encryptor. 35 | 36 | Args: 37 | password: The password to use for encryption/decryption. If None, 38 | encryption/decryption methods will raise errors. 39 | 40 | Raises: 41 | ConfigurationError: If 'cryptography' is not installed. 42 | """ 43 | if not CRYPTOGRAPHY_AVAILABLE: 44 | msg = "Encryption features require 'cryptography'. Install with: pip install safe_store[encryption]" 45 | ASCIIColors.error(msg) 46 | raise ConfigurationError(msg) 47 | 48 | if password is None: 49 | self._fernet = None 50 | ASCIIColors.debug("Encryptor initialized without a password. Encryption/decryption disabled.") 51 | else: 52 | if not isinstance(password, str) or not password: 53 | raise ValueError("Encryption password must be a non-empty string.") 54 | # Note: Storing the key directly is generally discouraged in production. 55 | # This simple implementation derives the key on init. 56 | # A more robust system might derive it on demand or use a dedicated key management system. 57 | key = self._derive_key(password) 58 | self._fernet = Fernet(key) 59 | ASCIIColors.debug("Encryptor initialized with password-derived key.") 60 | 61 | @staticmethod 62 | def _derive_key(password: str, salt: Optional[bytes] = None) -> bytes: 63 | """ 64 | Derives a 32-byte key suitable for Fernet using PBKDF2HMAC-SHA256. 65 | 66 | A fixed salt is used here for simplicity, allowing the same password 67 | to always produce the same key. **WARNING:** In a real-world scenario, 68 | you'd typically generate a *unique* salt per encryption and store it 69 | alongside the ciphertext. However, for this use case (encrypting chunks 70 | all potentially decrypted with the same store instance/password), using a 71 | fixed derivative might be acceptable, though less ideal than per-chunk salts. 72 | Using a hardcoded salt makes it slightly less secure than generating one. 73 | Let's stick to a hardcoded one for simplicity of this library's scope, 74 | but document this limitation heavily. 
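        A minimal illustration of this trade-off (illustrative only, not part of the
        library's examples): with the fixed salt and fixed iteration count, key
        derivation is deterministic, so the same password always produces the same key.

            k1 = Encryptor._derive_key("my password")
            k2 = Encryptor._derive_key("my password")
            assert k1 == k2  # same password -> same Fernet key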
75 | 76 | Args: 77 | password: The user-provided password string. 78 | salt: Optional salt (not used in this fixed-salt implementation). 79 | 80 | Returns: 81 | A URL-safe base64-encoded 32-byte key. 82 | """ 83 | # --- !!! SECURITY WARNING !!! --- 84 | # Using a hardcoded salt is NOT best practice for general encryption. 85 | # It means the same password will always yield the same key, reducing 86 | # protection against rainbow table attacks compared to unique salts. 87 | # This is a simplification for this specific library context. 88 | # Consider generating and storing salts if higher security is needed. 89 | hardcoded_salt = b'safe_store_salt_' # 16 bytes 90 | 91 | if PBKDF2HMAC is None or hashes is None: 92 | # Should be caught by __init__ check, but defensive coding 93 | raise ConfigurationError("Cryptography library components missing for key derivation.") 94 | 95 | kdf = PBKDF2HMAC( 96 | algorithm=hashes.SHA256(), 97 | length=32, # Fernet keys are 32 bytes 98 | salt=hardcoded_salt, 99 | iterations=PBKDF2_ITERATIONS, 100 | ) 101 | key = base64.urlsafe_b64encode(kdf.derive(password.encode('utf-8'))) 102 | return key 103 | 104 | @property 105 | def is_enabled(self) -> bool: 106 | """Returns True if encryption is configured (password provided).""" 107 | return self._fernet is not None 108 | 109 | def encrypt(self, data: str) -> bytes: 110 | """ 111 | Encrypts string data. 112 | 113 | Args: 114 | data: The plaintext string to encrypt. 115 | 116 | Returns: 117 | The encrypted data as bytes (Fernet token). 118 | 119 | Raises: 120 | EncryptionError: If encryption is not enabled or fails. 121 | TypeError: If input data is not a string. 122 | """ 123 | if not self.is_enabled or self._fernet is None: 124 | raise EncryptionError("Encryption is not enabled (no password provided).") 125 | if not isinstance(data, str): 126 | raise TypeError("Data to encrypt must be a string.") 127 | 128 | try: 129 | encrypted_data = self._fernet.encrypt(data.encode('utf-8')) 130 | return encrypted_data 131 | except Exception as e: 132 | msg = f"Encryption failed: {e}" 133 | ASCIIColors.error(msg, exc_info=True) 134 | raise EncryptionError(msg) from e 135 | 136 | def decrypt(self, token: bytes) -> str: 137 | """ 138 | Decrypts a Fernet token back into a string. 139 | 140 | Args: 141 | token: The encrypted data (Fernet token) as bytes. 142 | 143 | Returns: 144 | The decrypted plaintext string. 145 | 146 | Raises: 147 | EncryptionError: If decryption is not enabled, the token is invalid 148 | (tampered or wrong key), or decryption fails. 149 | TypeError: If input token is not bytes. 150 | """ 151 | if not self.is_enabled or self._fernet is None: 152 | raise EncryptionError("Decryption is not enabled (no password provided).") 153 | if not isinstance(token, bytes): 154 | raise TypeError("Token to decrypt must be bytes.") 155 | 156 | try: 157 | decrypted_data = self._fernet.decrypt(token) 158 | return decrypted_data.decode('utf-8') 159 | except InvalidToken: 160 | msg = "Decryption failed: Invalid token (likely tampered or wrong key)." 
161 | ASCIIColors.error(msg) 162 | raise EncryptionError(msg) from InvalidToken # Chain specific error 163 | except Exception as e: 164 | msg = f"Decryption failed: {e}" 165 | ASCIIColors.error(msg, exc_info=True) 166 | raise EncryptionError(msg) from e -------------------------------------------------------------------------------- /safe_store/vectorization/methods/lollms/__init__.py: -------------------------------------------------------------------------------- 1 | # safe_store/vectorization/methods/lollms.py 2 | import numpy as np 3 | from typing import List, Optional, Dict, Any 4 | import os 5 | 6 | from ...base import BaseVectorizer 7 | from ....core.exceptions import ConfigurationError, VectorizationError 8 | from ascii_colors import ASCIIColors 9 | 10 | class_name = "LollmsVectorizer" 11 | 12 | import pipmaster as pm 13 | pm.ensure_packages(["openai"]) 14 | import openai 15 | 16 | _OpenAIAPIError = openai.APIError 17 | _OpenAIAuthenticationError = openai.AuthenticationError 18 | _OpenAINotFoundError = openai.NotFoundError 19 | _OpenAIRateLimitError = openai.RateLimitError 20 | _OpenAIBadRequestError = openai.BadRequestError 21 | _OpenAIAPIConnectionError = openai.APIConnectionError 22 | _OPENAI_AVAILABLE = True 23 | 24 | def list_available_models(**kwargs) -> List[str]: 25 | """ 26 | Dynamically lists models from a running Lollms (OpenAI-compatible) server. 27 | """ 28 | if not _OPENAI_AVAILABLE: 29 | raise ConfigurationError("Lollms support requires 'openai'. Please run: pip install safe_store[openai]") 30 | 31 | base_url = kwargs.get("base_url", "http://localhost:9600") 32 | api_key = kwargs.get("api_key", "not_needed") 33 | 34 | try: 35 | client = openai.Client(base_url=base_url, api_key=api_key) 36 | models = client.models.list() 37 | # The response is an object with a 'data' attribute which is a list of model objects 38 | return [model.id for model in models.data] 39 | except openai.APIConnectionError as e: 40 | raise VectorizationError(f"Could not connect to Lollms server at '{base_url}'. Please ensure it is running.") from e 41 | except Exception as e: 42 | raise VectorizationError(f"An unexpected error occurred while listing Lollms models: {e}") from e 43 | 44 | class LollmsVectorizer(BaseVectorizer): 45 | """ 46 | Vectorizes text using an OpenAI-compatible API, such as a local Lollms instance. 47 | 48 | Requires the `openai` Python library. The `model_config` dictionary specifies 49 | the model name and connection parameters. 50 | Example: 51 | `{"model": "nomic-embed-text", "base_url": "http://localhost:9600", "api_key": "..."}` 52 | 53 | Attributes: 54 | model_name (str): The name of the model to use for embeddings. 55 | api_key (Optional[str]): The API key for the service. 56 | base_url (Optional[str]): The base URL of the OpenAI-compatible API endpoint. 57 | client (openai.OpenAI): The OpenAI client instance. 58 | """ 59 | 60 | def __init__(self, model_config: Dict[str, Any], **kwargs): 61 | """ 62 | Initializes the LollmsVectorizer. 63 | 64 | Args: 65 | model_config: A dictionary with configuration. Must contain "model". 66 | Can also contain "api_key" and "base_url". 67 | 68 | Raises: 69 | ConfigurationError: If 'openai' is not installed or config is invalid. 70 | VectorizationError: If connection or model test fails. 71 | """ 72 | super().__init__( 73 | vectorizer_name="lollms" 74 | ) 75 | if not _OPENAI_AVAILABLE or openai is None: 76 | msg = "LollmsVectorizer requires the 'openai' library. 
Install with: pip install safe_store[openai]" 77 | raise ConfigurationError(msg) 78 | 79 | if not isinstance(model_config, dict) or "model" not in model_config: 80 | msg = "Lollms vectorizer config must be a dictionary with a 'model' key." 81 | raise ConfigurationError(msg) 82 | 83 | self.model_name: str = model_config["model"] 84 | self.api_key: Optional[str] = model_config.get("api_key", "not_needed") 85 | self.base_url: Optional[str] = model_config.get("base_url") 86 | 87 | ASCIIColors.info(f"Initializing Lollms (OpenAI-compatible) client. Model: {self.model_name}, Base URL: {self.base_url}") 88 | try: 89 | self.client: openai.OpenAI = openai.OpenAI(api_key=self.api_key, base_url=self.base_url) 90 | 91 | ASCIIColors.debug(f"Testing model '{self.model_name}' and retrieving dimension...") 92 | test_prompt = "hello world" 93 | response = self.client.embeddings.create( 94 | model=self.model_name, 95 | input=[test_prompt] 96 | ) 97 | 98 | if not response.data or not response.data[0].embedding: 99 | raise VectorizationError(f"Model '{self.model_name}' did not return a valid embedding. Response: {response}") 100 | 101 | embedding = response.data[0].embedding 102 | 103 | if not isinstance(embedding, list) or not embedding: 104 | raise VectorizationError(f"Model '{self.model_name}' returned an invalid embedding structure. Embedding: {embedding}") 105 | 106 | self._dim = len(embedding) 107 | if self._dim == 0: 108 | raise VectorizationError(f"Model '{self.model_name}' returned a zero-dimension embedding.") 109 | 110 | self._dtype = np.dtype(np.float32) 111 | ASCIIColors.info(f"Model '{self.model_name}' ready. Dimension: {self._dim}, Dtype: {self._dtype.name}") 112 | 113 | except (_OpenAIAuthenticationError, _OpenAINotFoundError, _OpenAIBadRequestError, _OpenAIRateLimitError, _OpenAIAPIConnectionError, _OpenAIAPIError) as e: 114 | msg = (f"API error for model '{self.model_name}': " 115 | f"HTTP {e.http_status if hasattr(e, 'http_status') else 'N/A'} - {e.code if hasattr(e, 'code') else 'N/A'} - {e.message}.") 116 | raise VectorizationError(msg) from e 117 | except Exception as e: 118 | msg = f"Failed to initialize Lollms vectorizer or test model '{self.model_name}': {e}" 119 | raise VectorizationError(msg) from e 120 | 121 | def vectorize(self, texts: List[str]) -> np.ndarray: 122 | """ 123 | Generates vector embeddings for a list of texts. 124 | 125 | Args: 126 | texts: A list of text strings to vectorize. 127 | 128 | Returns: 129 | A 2D NumPy array of embeddings. 130 | 131 | Raises: 132 | VectorizationError: If the embedding process fails. 
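        Example (illustrative sketch; assumes a reachable Lollms/OpenAI-compatible
        server and the configuration shown in the class docstring):

            vec = LollmsVectorizer({"model": "nomic-embed-text", "base_url": "http://localhost:9600"})
            arr = vec.vectorize(["hello", "world"])  # arr.shape == (len(texts), vec.dim)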
133 | """ 134 | if not texts: 135 | return np.empty((0, self.dim), dtype=self.dtype) 136 | 137 | ASCIIColors.debug(f"Vectorizing {len(texts)} texts using Lollms model '{self.model_name}'...") 138 | 139 | embeddings_results = [None] * len(texts) 140 | actual_texts_to_embed: List[str] = [] 141 | original_indices_for_api_texts: List[int] = [] 142 | 143 | for i, text in enumerate(texts): 144 | if not text.strip(): 145 | embeddings_results[i] = np.zeros(self.dim, dtype=self._dtype) 146 | else: 147 | actual_texts_to_embed.append(text) 148 | original_indices_for_api_texts.append(i) 149 | 150 | if actual_texts_to_embed: 151 | try: 152 | response = self.client.embeddings.create( 153 | model=self.model_name, 154 | input=actual_texts_to_embed 155 | ) 156 | 157 | if len(response.data) != len(actual_texts_to_embed): 158 | raise VectorizationError(f"API returned {len(response.data)} embeddings for {len(actual_texts_to_embed)} inputs.") 159 | 160 | for i, embedding_data in enumerate(response.data): 161 | original_idx = original_indices_for_api_texts[i] 162 | embedding_vector = embedding_data.embedding 163 | if not isinstance(embedding_vector, list) or len(embedding_vector) != self.dim: 164 | raise VectorizationError(f"Model '{self.model_name}' returned an invalid embedding for text at original index {original_idx}.") 165 | embeddings_results[original_idx] = embedding_vector 166 | 167 | except (_OpenAIBadRequestError, _OpenAIRateLimitError, _OpenAIAPIConnectionError, _OpenAIAPIError) as e: 168 | msg = f"Lollms API error during vectorization: {e.message if hasattr(e, 'message') else str(e)}" 169 | raise VectorizationError(msg) from e 170 | except Exception as e: 171 | msg = f"Unexpected error during Lollms vectorization: {e}" 172 | raise VectorizationError(msg) from e 173 | 174 | embeddings_array = np.array(embeddings_results, dtype=self._dtype) 175 | 176 | if embeddings_array.ndim != 2 or embeddings_array.shape[0] != len(texts) or embeddings_array.shape[1] != self.dim: 177 | raise VectorizationError(f"Vectorization resulted in unexpected shape {embeddings_array.shape}. Expected ({len(texts)}, {self.dim}).") 178 | 179 | ASCIIColors.debug(f"Lollms vectorization complete. 
Output shape: {embeddings_array.shape}") 180 | return embeddings_array 181 | 182 | @property 183 | def dim(self) -> int: 184 | return self._dim 185 | 186 | @property 187 | def dtype(self) -> np.dtype: 188 | return self._dtype 189 | 190 | @staticmethod 191 | def list_models(**kwargs) -> List[str]: 192 | """Listing models is dependent on the lollms binding and not exposed via client yet.""" 193 | return [] -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # tests/conftest.py 2 | import pytest 3 | from pathlib import Path 4 | import sqlite3 5 | import shutil 6 | import numpy as np 7 | from unittest.mock import MagicMock 8 | 9 | # Import the class for type hinting 10 | from safe_store import SafeStore, LogLevel 11 | 12 | # --- Fixture Directory --- 13 | FIXTURES_DIR = Path(__file__).parent / "fixtures" 14 | 15 | # --- Dependency Availability Check --- 16 | # Check for sentence-transformers 17 | try: 18 | from sentence_transformers import SentenceTransformer 19 | SENTENCE_TRANSFORMERS_AVAILABLE = True 20 | except ImportError: 21 | SENTENCE_TRANSFORMERS_AVAILABLE = False 22 | SentenceTransformer = None # Define as None if not available 23 | 24 | # Check for scikit-learn 25 | try: 26 | from sklearn.feature_extraction.text import TfidfVectorizer 27 | from sklearn.exceptions import NotFittedError 28 | SKLEARN_AVAILABLE = True 29 | except ImportError: 30 | SKLEARN_AVAILABLE = False 31 | TfidfVectorizer = None 32 | NotFittedError = None 33 | 34 | # --- Global Mocking Fixtures --- 35 | 36 | # Mock SentenceTransformer if not available 37 | if not SENTENCE_TRANSFORMERS_AVAILABLE: 38 | class MockSentenceTransformer: 39 | DEFAULT_MODEL = "mock-st-model" 40 | def __init__(self, model_name): 41 | self.model_name = model_name 42 | self._dim = 384 43 | self._dtype = np.float32 44 | def encode(self, texts, convert_to_numpy=True, show_progress_bar=False): 45 | if not texts: return np.empty((0, self._dim), dtype=self._dtype) 46 | return np.random.rand(len(texts), self._dim).astype(self._dtype) 47 | def get_sentence_embedding_dimension(self): return self._dim 48 | @property 49 | def dim(self): return self._dim 50 | @property 51 | def dtype(self): return self._dtype 52 | 53 | @pytest.fixture(scope="session", autouse=True) 54 | def mock_st_globally(session_mocker): 55 | # Use session_mocker if available, otherwise regular monkeypatch might work in module scope 56 | # Using monkeypatch fixture is generally preferred within test functions/fixtures 57 | # For autouse session scope, directly patching might be necessary if mocker isn't standard 58 | # Let's use monkeypatch fixture within other fixtures instead for safety. 
59 | pass # We will apply this mock conditionally in test files or fixtures needing it 60 | 61 | # Mock Scikit-learn if not available 62 | if not SKLEARN_AVAILABLE: 63 | class MockTfidfVectorizer: 64 | def __init__(self, **kwargs): 65 | self.params = kwargs; self._fitted = False; self.vocabulary_ = {}; self.idf_ = np.array([]) 66 | self.dtype = np.float64 67 | if 'dtype' in kwargs: 68 | try: self.dtype = np.dtype(kwargs['dtype']) 69 | except: pass 70 | class MockTfidfInternal: _idf_diag = MagicMock() 71 | self._tfidf = MockTfidfInternal(); self._dim = None 72 | def fit(self, texts): 73 | if not texts: self.vocabulary_ = {}; self.idf_ = np.array([], dtype=self.dtype); self._dim = 0 74 | else: 75 | words = set(w for t in texts for w in t.lower().split()) 76 | self.vocabulary_ = {w: i for i, w in enumerate(sorted(list(words)))} 77 | if not self.vocabulary_: self._dim = 0; self.idf_ = np.array([], dtype=self.dtype) 78 | else: self._dim = len(self.vocabulary_); self.idf_ = np.random.rand(self._dim).astype(self.dtype)*5+1 79 | self._fitted = True 80 | if hasattr(self, '_tfidf') and hasattr(self._tfidf, '_idf_diag'): self._tfidf._idf_diag.dtype = self.dtype 81 | return self 82 | def transform(self, texts): 83 | if not self._fitted: raise (NotFittedError or Exception)("MockTfidfVectorizer not fitted") 84 | if not texts: return MagicMock(**{'toarray.return_value': np.empty((0, self._dim or 0), dtype=self.dtype)}) 85 | num_samples=len(texts); vocab_size=self._dim if self._dim is not None else 0 86 | if vocab_size is None: vocab_size = 0 87 | dense_array = np.random.rand(num_samples, vocab_size).astype(self.dtype) 88 | return MagicMock(**{'toarray.return_value': dense_array, 'shape': dense_array.shape}) 89 | def get_params(self, deep=True): return self.params 90 | @property 91 | def dim(self): return self._dim 92 | 93 | @pytest.fixture(scope="session", autouse=True) 94 | def mock_sklearn_globally(session_mocker): 95 | # Similar caveat as mock_st_globally - apply conditionally where needed 96 | pass 97 | 98 | 99 | # --- Helper to conditionally apply mocks --- 100 | @pytest.fixture(autouse=True) 101 | def apply_mocks_conditionally(monkeypatch): 102 | """Applies mocks only if the libraries are unavailable.""" 103 | if not SENTENCE_TRANSFORMERS_AVAILABLE: 104 | monkeypatch.setattr("safe_store.vectorization.methods.sentence_transformer.SentenceTransformer", MockSentenceTransformer, raising=False) 105 | monkeypatch.setattr("safe_store.vectorization.methods.sentence_transformer._SENTENCE_TRANSFORMERS_AVAILABLE", True, raising=False) # Make wrapper think it's ok 106 | if not SKLEARN_AVAILABLE: 107 | monkeypatch.setattr("safe_store.vectorization.methods.tfidf.TfidfVectorizer", MockTfidfVectorizer, raising=False) 108 | monkeypatch.setattr("safe_store.vectorization.methods.tfidf.NotFittedError", NotFittedError or Exception, raising=False) 109 | monkeypatch.setattr("safe_store.vectorization.methods.tfidf._SKLEARN_AVAILABLE", True, raising=False) # Make wrapper think it's ok 110 | 111 | 112 | # --- Standard Fixtures --- 113 | @pytest.fixture(scope="function") 114 | def temp_db_path(tmp_path: Path) -> Path: 115 | """Provides a path to a temporary database file.""" 116 | return tmp_path / "test_safe_store.db" 117 | 118 | @pytest.fixture(scope="function") 119 | def safe_store_instance(temp_db_path: Path) -> SafeStore: 120 | """Provides a safe_store instance with a clean temporary database.""" 121 | # Ensure clean slate 122 | if temp_db_path.exists(): temp_db_path.unlink() 123 | lock_path = 
temp_db_path.with_suffix(".db.lock") 124 | if lock_path.exists(): lock_path.unlink() 125 | wal_path = temp_db_path.with_suffix(".db-wal") 126 | if wal_path.exists(): wal_path.unlink(missing_ok=True) 127 | shm_path = temp_db_path.with_suffix(".db-shm") 128 | if shm_path.exists(): shm_path.unlink(missing_ok=True) 129 | 130 | # Use DEBUG level for more verbose test output 131 | store = SafeStore(db_path=temp_db_path, log_level=LogLevel.DEBUG, lock_timeout=0.1) 132 | yield store 133 | store.close() # Ensure closure after test function finishes 134 | 135 | @pytest.fixture(scope="session") 136 | def sample_text_content() -> str: 137 | return "This is the first sentence.\nThis is the second sentence, it is a bit longer.\nAnd a third one." 138 | 139 | @pytest.fixture 140 | def sample_text_file(tmp_path: Path, sample_text_content: str) -> Path: 141 | """Creates a temporary text file.""" 142 | p = tmp_path / "sample.txt" 143 | p.write_text(sample_text_content, encoding='utf-8') 144 | return p 145 | 146 | # --- Phase 3 Fixtures --- 147 | @pytest.fixture 148 | def sample_pdf_file(tmp_path: Path) -> Path: 149 | """Copies the sample PDF to the temp directory.""" 150 | source = FIXTURES_DIR / "sample.pdf" 151 | if not source.exists(): pytest.skip("sample.pdf fixture file not found") 152 | dest = tmp_path / "sample.pdf" 153 | try: shutil.copy(source, dest) 154 | except Exception as e: pytest.fail(f"Failed to copy fixture file {source}: {e}") 155 | return dest 156 | 157 | @pytest.fixture 158 | def sample_docx_file(tmp_path: Path) -> Path: 159 | """Copies the sample DOCX to the temp directory.""" 160 | source = FIXTURES_DIR / "sample.docx" 161 | if not source.exists(): pytest.skip("sample.docx fixture file not found") 162 | dest = tmp_path / "sample.docx" 163 | try: shutil.copy(source, dest) 164 | except Exception as e: pytest.fail(f"Failed to copy fixture file {source}: {e}") 165 | return dest 166 | 167 | @pytest.fixture 168 | def sample_html_file(tmp_path: Path) -> Path: 169 | """Copies the sample HTML to the temp directory.""" 170 | source = FIXTURES_DIR / "sample.html" 171 | if not source.exists(): pytest.skip("sample.html fixture file not found") 172 | dest = tmp_path / "sample.html" 173 | try: shutil.copy(source, dest) 174 | except Exception as e: pytest.fail(f"Failed to copy fixture file {source}: {e}") 175 | return dest 176 | 177 | 178 | # --- Phase 2 Fixture --- 179 | @pytest.fixture 180 | def populated_store(safe_store_instance: SafeStore, sample_text_file: Path, tmp_path: Path) -> SafeStore: 181 | """Provides a safe_store instance with two documents added using the default ST vectorizer.""" 182 | store = safe_store_instance 183 | doc2_content = "Another document.\nWith different content for testing." 
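# A second, differently-worded document lets query tests rank chunks from more than one source file.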
184 | doc2_path = tmp_path / "sample2.txt" 185 | doc2_path.write_text(doc2_content, encoding='utf-8') 186 | 187 | # No need for availability check here due to global autouse fixture apply_mocks_conditionally 188 | try: 189 | with store: 190 | store.add_document(sample_text_file, chunk_size=30, chunk_overlap=5) 191 | store.add_document(doc2_path, chunk_size=25, chunk_overlap=5) 192 | except Exception as e: 193 | pytest.fail(f"Populated store fixture setup failed: {e}") 194 | 195 | return store -------------------------------------------------------------------------------- /tests/test_store_phase3.py: -------------------------------------------------------------------------------- 1 | # tests/test_store_phase3.py 2 | import pytest 3 | import sqlite3 4 | from pathlib import Path 5 | from unittest.mock import patch, MagicMock, call 6 | import re 7 | 8 | # Import exceptions and modules 9 | from safe_store import store as safe_store_store_module 10 | from safe_store import SafeStore, LogLevel 11 | from safe_store.core import db 12 | from safe_store.core.exceptions import ConcurrencyError, FileHandlingError, ConfigurationError, SafeStoreError 13 | 14 | # Import filelock components 15 | from filelock import Timeout, FileLock 16 | 17 | # --- FIX: Define availability check locally --- 18 | # Check for parsing libraries availability within this module 19 | try: 20 | import pypdf 21 | import docx 22 | import bs4 23 | import lxml 24 | PARSING_LIBS_AVAILABLE = True 25 | except ImportError: 26 | PARSING_LIBS_AVAILABLE = False 27 | 28 | # --- Test Constants --- 29 | PDF_TEXT = "This is PDF content." 30 | DOCX_TEXT = "This is DOCX content." 31 | HTML_TEXT = "This is HTML content." # Adjusted based on sample file 32 | 33 | # Helper for log checks 34 | def assert_log_call_containing(mock_logger, expected_substring): 35 | """Checks if any call to the mock logger contained the substring.""" 36 | found = False 37 | for call_args in mock_logger.call_args_list: 38 | args, kwargs = call_args 39 | if args and isinstance(args[0], str) and expected_substring in args[0]: 40 | found = True 41 | break 42 | if not found: 43 | for method_call in mock_logger.method_calls: 44 | call_name, args, kwargs = method_call 45 | if args and isinstance(args[0], str) and expected_substring in args[0]: 46 | found = True 47 | break 48 | assert found, f"Expected log call containing '{expected_substring}' not found in {mock_logger.call_args_list} or {mock_logger.method_calls}" 49 | 50 | 51 | # --- Parser Integration Tests --- 52 | 53 | # Use the locally defined PARSING_LIBS_AVAILABLE for skipif 54 | @pytest.mark.skipif(not PARSING_LIBS_AVAILABLE, reason="Requires parsing dependencies (pypdf, python-docx, beautifulsoup4, lxml)") 55 | @patch('safe_store.indexing.parser.ASCIIColors') 56 | @patch('safe_store.store.ASCIIColors') 57 | def test_add_document_pdf(mock_store_colors, mock_parser_colors, safe_store_instance: SafeStore, sample_pdf_file: Path): 58 | """Test adding a PDF document via safe_store.add_document.""" 59 | store = safe_store_instance 60 | with store: 61 | store.add_document(sample_pdf_file, chunk_size=50, chunk_overlap=10) 62 | 63 | assert_log_call_containing(mock_store_colors.info, f"Starting indexing process for: {sample_pdf_file.name}") 64 | assert_log_call_containing(mock_parser_colors.debug, "Dispatching parser for extension '.pdf'") 65 | assert_log_call_containing(mock_parser_colors.debug, f"Attempting to parse PDF file: {sample_pdf_file}") 66 | assert_log_call_containing(mock_store_colors.info, "Generated 1 chunks 
for") 67 | assert_log_call_containing(mock_store_colors.success, f"Successfully processed '{sample_pdf_file.name}'") 68 | mock_store_colors.error.assert_not_called() 69 | 70 | conn = sqlite3.connect(store.db_path) 71 | cursor = conn.cursor() 72 | cursor.execute("SELECT doc_id, full_text FROM documents WHERE file_path = ?", (str(sample_pdf_file.resolve()),)) 73 | doc_result = cursor.fetchone(); assert doc_result is not None 74 | doc_id = doc_result[0]; parsed_text = doc_result[1] 75 | assert len(parsed_text) > 5; assert "pdf" in parsed_text.lower() 76 | cursor.execute("SELECT COUNT(*) FROM chunks WHERE doc_id = ?", (doc_id,)) 77 | chunk_count = cursor.fetchone()[0]; assert chunk_count == 1 78 | cursor.execute("SELECT COUNT(v.vector_id) FROM vectors v JOIN chunks c ON v.chunk_id = c.chunk_id WHERE c.doc_id = ?", (doc_id,)) 79 | vector_count = cursor.fetchone()[0]; assert vector_count == 1 80 | conn.close() 81 | 82 | 83 | @pytest.mark.skipif(not PARSING_LIBS_AVAILABLE, reason="Requires parsing dependencies (pypdf, python-docx, beautifulsoup4, lxml)") 84 | @patch('safe_store.indexing.parser.ASCIIColors') 85 | @patch('safe_store.store.ASCIIColors') 86 | def test_add_document_docx(mock_store_colors, mock_parser_colors, safe_store_instance: SafeStore, sample_docx_file: Path): 87 | """Test adding a DOCX document via safe_store.add_document.""" 88 | store = safe_store_instance 89 | with store: 90 | store.add_document(sample_docx_file, chunk_size=50, chunk_overlap=10) 91 | 92 | assert_log_call_containing(mock_store_colors.info, f"Starting indexing process for: {sample_docx_file.name}") 93 | assert_log_call_containing(mock_parser_colors.debug, "Dispatching parser for extension '.docx'") 94 | assert_log_call_containing(mock_parser_colors.debug, f"Attempting to parse DOCX file: {sample_docx_file}") 95 | assert_log_call_containing(mock_store_colors.info, "Generated 1 chunks for") 96 | assert_log_call_containing(mock_store_colors.success, f"Successfully processed '{sample_docx_file.name}'") 97 | mock_store_colors.error.assert_not_called() 98 | 99 | conn = sqlite3.connect(store.db_path) 100 | cursor = conn.cursor() 101 | cursor.execute("SELECT doc_id, full_text FROM documents WHERE file_path = ?", (str(sample_docx_file.resolve()),)) 102 | doc_result = cursor.fetchone(); assert doc_result is not None 103 | doc_id = doc_result[0]; assert DOCX_TEXT == doc_result[1].strip() 104 | cursor.execute("SELECT COUNT(*) FROM chunks WHERE doc_id = ?", (doc_id,)) 105 | chunk_count = cursor.fetchone()[0]; assert chunk_count == 1 106 | cursor.execute("SELECT COUNT(v.vector_id) FROM vectors v JOIN chunks c ON v.chunk_id = c.chunk_id WHERE c.doc_id = ?", (doc_id,)) 107 | vector_count = cursor.fetchone()[0]; assert vector_count == 1 108 | conn.close() 109 | 110 | 111 | @pytest.mark.skipif(not PARSING_LIBS_AVAILABLE, reason="Requires parsing dependencies (pypdf, python-docx, beautifulsoup4, lxml)") 112 | @patch('safe_store.indexing.parser.ASCIIColors') 113 | @patch('safe_store.store.ASCIIColors') 114 | def test_add_document_html(mock_store_colors, mock_parser_colors, safe_store_instance: SafeStore, sample_html_file: Path): 115 | """Test adding an HTML document via safe_store.add_document.""" 116 | store = safe_store_instance 117 | with store: 118 | store.add_document(sample_html_file, chunk_size=50, chunk_overlap=10) 119 | 120 | assert_log_call_containing(mock_store_colors.info, f"Starting indexing process for: {sample_html_file.name}") 121 | assert_log_call_containing(mock_parser_colors.debug, "Dispatching parser for 
extension '.html'") 122 | assert_log_call_containing(mock_parser_colors.debug, f"Attempting to parse HTML file: {sample_html_file}") 123 | assert_log_call_containing(mock_store_colors.info, "Generated 1 chunks for") 124 | assert_log_call_containing(mock_store_colors.success, f"Successfully processed '{sample_html_file.name}'") 125 | mock_store_colors.error.assert_not_called() 126 | 127 | conn = sqlite3.connect(store.db_path) 128 | cursor = conn.cursor() 129 | cursor.execute("SELECT doc_id, full_text FROM documents WHERE file_path = ?", (str(sample_html_file.resolve()),)) 130 | doc_result = cursor.fetchone(); assert doc_result is not None 131 | doc_id = doc_result[0]; assert HTML_TEXT == doc_result[1].strip() 132 | cursor.execute("SELECT COUNT(*) FROM chunks WHERE doc_id = ?", (doc_id,)) 133 | chunk_count = cursor.fetchone()[0]; assert chunk_count == 1 134 | cursor.execute("SELECT COUNT(v.vector_id) FROM vectors v JOIN chunks c ON v.chunk_id = c.chunk_id WHERE c.doc_id = ?", (doc_id,)) 135 | vector_count = cursor.fetchone()[0]; assert vector_count == 1 136 | conn.close() 137 | 138 | # --- Concurrency Tests --- 139 | @patch('safe_store.store.ASCIIColors') 140 | def test_add_document_lock_acquired(mock_store_colors, safe_store_instance: SafeStore, sample_text_file: Path): 141 | """Test that add_document acquires and releases the lock.""" 142 | store = safe_store_instance 143 | with patch.object(store, '_file_lock', autospec=True) as mock_lock_instance: 144 | mock_lock_instance.__enter__.return_value = None 145 | mock_lock_instance.__exit__.return_value = None 146 | with store: 147 | store.add_document(sample_text_file) 148 | mock_lock_instance.__enter__.assert_called_once() 149 | mock_lock_instance.__exit__.assert_called_once() 150 | 151 | assert_log_call_containing(mock_store_colors.debug, "Attempting to acquire write lock for add_document") 152 | assert_log_call_containing(mock_store_colors.info, "Write lock acquired for add_document") 153 | assert_log_call_containing(mock_store_colors.debug, f"Write lock released for add_document: {sample_text_file.name}") 154 | assert_log_call_containing(mock_store_colors.success, f"Successfully processed '{sample_text_file.name}'") 155 | 156 | 157 | @patch('safe_store.store.ASCIIColors') 158 | def test_add_document_lock_timeout(mock_store_colors, safe_store_instance: SafeStore, sample_text_file: Path): 159 | """Test that add_document handles a lock timeout.""" 160 | store = safe_store_instance 161 | mock_lock_instance = MagicMock(spec=FileLock) 162 | timeout_exception = Timeout(store.lock_path) 163 | mock_lock_instance.__enter__.side_effect = timeout_exception 164 | 165 | with patch.object(store, '_file_lock', mock_lock_instance): 166 | expected_error_msg = f"Timeout ({store.lock_timeout}s) acquiring write lock for add_document: {sample_text_file.name}" 167 | with pytest.raises(ConcurrencyError, match=re.escape(expected_error_msg)): 168 | store.add_document(sample_text_file) 169 | mock_lock_instance.__enter__.assert_called_once() 170 | mock_lock_instance.__exit__.assert_not_called() 171 | 172 | assert_log_call_containing(mock_store_colors.debug, "Attempting to acquire write lock for add_document") 173 | assert_log_call_containing(mock_store_colors.error, expected_error_msg) 174 | mock_store_colors.success.assert_not_called() 175 | 176 | -------------------------------------------------------------------------------- /point_cloud_web_app/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 
| SafeStore | 2D Chunk Visualization 8 | (markup, styles, and the inline script of this page were stripped in this dump; only the visible page text survives) 24 | 2D Document Chunk Visualization 27 | An interactive PCA plot of vectorized document chunks. Each point represents a piece of text, clustered by semantic similarity. Powered by SafeStore. 41 | Chunk Inspector
-------------------------------------------------------------------------------- /safe_store/vectorization/methods/cohere/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from typing import List, Optional, Dict, Any 3 | import os 4 | 5 | from ...base import BaseVectorizer 6 | from ....core.exceptions import ConfigurationError, VectorizationError 7 | from ascii_colors import ASCIIColors 8 | 9 | # each vectorizer must have a class name variable to be identified 10 | class_name = "CohereVectorizer" 11 | 12 | def list_available_models(**kwargs) -> List[str]: 13 | """ 14 | Returns a static list of common Cohere embedding models. 15 | """ 16 | return ["embed-english-v3.0", "embed-english-light-v3.0", "embed-multilingual-v3.0"] 17 | 18 | # Attempt import of cohere and related error types, handle gracefully 19 | try: 20 | import pipmaster as pm 21 | pm.ensure_packages(["cohere"]) # Ensure cohere is installed 22 | import cohere 23 | _CohereAPIError = cohere.APIError 24 | _CohereConnectionError = cohere.ConnectionError 25 | _COHERE_AVAILABLE = True 26 | except (ImportError, AttributeError): # error class names vary across cohere SDK versions 27 | _COHERE_AVAILABLE = False 28 | cohere = None 29 | class _CohereAPIError(Exception): pass 30 | class _CohereConnectionError(Exception): pass 31 | 32 | 33 | class CohereVectorizer(BaseVectorizer): 34 | """ 35 | Vectorizes text using models from Cohere via their API. 36 | 37 | Requires the `cohere` Python library and a Cohere API key. The key can be 38 | provided in the `model_config` dictionary or via the `COHERE_API_KEY` 39 | environment variable. 40 | 41 | Attributes: 42 | model_name (str): The name of the Cohere model to use. 43 | api_key (str): The Cohere API key being used. 44 | client (cohere.Client): The Cohere client instance. 45 | input_type (str): The input type for the embedding model. 46 | truncate (str): The truncation strategy for the model. 47 | """ 48 | DEFAULT_INPUT_TYPE = "search_document" 49 | DEFAULT_TRUNCATE = "END" 50 | 51 | def __init__(self, 52 | model_config: Dict[str, Any], 53 | **kwargs): 54 | """ 55 | Initializes the CohereVectorizer. 56 | 57 | Args: 58 | model_config: A dictionary containing the vectorizer's configuration. 59 | - "model" (str): Mandatory. The name of the model to use. 60 | - "api_key" (str): Optional. Your Cohere API key. If not 61 | provided, the COHERE_API_KEY environment variable is used. 62 | - "input_type" (str): Optional. E.g., "search_document". 63 | - "truncate" (str): Optional. E.g., "END". 64 | 65 | Raises: 66 | ConfigurationError: If 'cohere' is not installed, config is invalid, 67 | or the API key is missing. 68 | VectorizationError: If connection to Cohere fails or the model is invalid. 69 | """ 70 | super().__init__(vectorizer_name="cohere") 71 | if not _COHERE_AVAILABLE or cohere is None: 72 | raise ConfigurationError("CohereVectorizer requires the 'cohere' library. Install with: pip install safe_store[cohere]") 73 | 74 | if not isinstance(model_config, dict) or "model" not in model_config: 75 | raise ConfigurationError("Cohere vectorizer config must be a dictionary with a 'model' key.") 76 | 77 | self.model_name: str = model_config["model"] 78 | 79 | # API key discovery logic 80 | chosen_api_key: Optional[str] = model_config.get("api_key") 81 | if chosen_api_key: 82 | ASCIIColors.info("Using Cohere API key provided in vectorizer_config.") 83 | else: 84 | ASCIIColors.info("API key not in config. 
Checking COHERE_API_KEY environment variable.") 85 | chosen_api_key = os.environ.get("COHERE_API_KEY") 86 | 87 | if not chosen_api_key: 88 | raise ConfigurationError("Cohere API key not found. Provide it in the 'api_key' field of vectorizer_config or set the COHERE_API_KEY environment variable.") 89 | 90 | self.api_key: str = chosen_api_key 91 | 92 | # Get additional parameters from config 93 | self.input_type: str = model_config.get("input_type", self.DEFAULT_INPUT_TYPE) 94 | self.truncate: str = model_config.get("truncate", self.DEFAULT_TRUNCATE) 95 | 96 | # Parameter validation 97 | valid_input_types = ["search_document", "search_query", "classification", "clustering", "rerank"] 98 | if self.input_type not in valid_input_types: 99 | ASCIIColors.warning(f"Invalid input_type '{self.input_type}'. Defaulting to '{self.DEFAULT_INPUT_TYPE}'.") 100 | self.input_type = self.DEFAULT_INPUT_TYPE 101 | 102 | valid_truncate_types = ["NONE", "START", "END"] 103 | if self.truncate not in valid_truncate_types: 104 | ASCIIColors.warning(f"Invalid truncate type '{self.truncate}'. Defaulting to '{self.DEFAULT_TRUNCATE}'.") 105 | self.truncate = self.DEFAULT_TRUNCATE 106 | 107 | ASCIIColors.info(f"Initializing Cohere client. Model: {self.model_name}, Input Type: {self.input_type}") 108 | try: 109 | self.client: cohere.Client = cohere.Client(api_key=self.api_key) 110 | 111 | # Test connection and get embedding dimension 112 | test_prompt = "hello world" 113 | response = self.client.embed( 114 | texts=[test_prompt], 115 | model=self.model_name, 116 | input_type=self.input_type, 117 | truncate=self.truncate 118 | ) 119 | 120 | if not hasattr(response, 'embeddings') or not response.embeddings or not response.embeddings[0]: 121 | raise VectorizationError(f"Cohere model '{self.model_name}' did not return valid embeddings.") 122 | 123 | embedding = response.embeddings[0] 124 | 125 | self._dim = len(embedding) 126 | if self._dim == 0: 127 | raise VectorizationError(f"Cohere model '{self.model_name}' returned a zero-dimension embedding.") 128 | 129 | self._dtype = np.dtype(np.float32) 130 | ASCIIColors.info(f"Cohere model '{self.model_name}' ready. Dimension: {self._dim}") 131 | 132 | except _CohereAPIError as e: 133 | msg = f"Cohere API error for model '{self.model_name}': {e}. Check API key and model name." 
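# Auth, permission, and not-found responses (401/403/404) indicate a configuration problem (bad key or model name), so they are promoted to ConfigurationError below; other API failures remain VectorizationError.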
134 | if hasattr(e, 'http_status') and e.http_status in [401, 403, 404]: 135 | raise ConfigurationError(msg) from e 136 | raise VectorizationError(msg) from e 137 | except _CohereConnectionError as e: 138 | raise VectorizationError(f"Cohere connection error for model '{self.model_name}': {e}.") from e 139 | except Exception as e: 140 | raise VectorizationError(f"Failed to initialize Cohere vectorizer '{self.model_name}': {e}") from e 141 | 142 | def vectorize(self, texts: List[str]) -> np.ndarray: 143 | if not texts: 144 | return np.empty((0, self.dim), dtype=self.dtype) 145 | 146 | embeddings_results = [None] * len(texts) 147 | actual_texts_to_embed: List[str] = [] 148 | original_indices_for_api_texts: List[int] = [] 149 | 150 | for i, text in enumerate(texts): 151 | stripped_text = text.strip() 152 | if not stripped_text: 153 | embeddings_results[i] = np.zeros(self.dim, dtype=self._dtype) 154 | else: 155 | actual_texts_to_embed.append(stripped_text) 156 | original_indices_for_api_texts.append(i) 157 | 158 | if actual_texts_to_embed: 159 | try: 160 | if len(actual_texts_to_embed) > 96: 161 | ASCIIColors.warning(f"Attempting to vectorize {len(actual_texts_to_embed)} texts with Cohere, which may exceed batch limits.") 162 | 163 | response = self.client.embed( 164 | texts=actual_texts_to_embed, 165 | model=self.model_name, 166 | input_type=self.input_type, 167 | truncate=self.truncate 168 | ) 169 | 170 | if not hasattr(response, 'embeddings') or len(response.embeddings) != len(actual_texts_to_embed): 171 | raise VectorizationError("Cohere API returned a mismatched number of embeddings.") 172 | 173 | for i, embedding_vector in enumerate(response.embeddings): 174 | original_idx = original_indices_for_api_texts[i] 175 | if not isinstance(embedding_vector, list) or len(embedding_vector) != self.dim: 176 | raise VectorizationError(f"Cohere model returned an invalid embedding for text at index {original_idx}.") 177 | embeddings_results[original_idx] = embedding_vector 178 | 179 | except (_CohereAPIError, _CohereConnectionError) as e: 180 | raise VectorizationError(f"Cohere API error during vectorization: {e}") from e 181 | except Exception as e: 182 | raise VectorizationError(f"Unexpected error during Cohere vectorization: {e}") from e 183 | 184 | embeddings_array = np.array(embeddings_results, dtype=self._dtype) 185 | 186 | if embeddings_array.shape != (len(texts), self.dim): 187 | raise VectorizationError(f"Cohere vectorization resulted in unexpected shape {embeddings_array.shape}.") 188 | 189 | return embeddings_array 190 | 191 | @property 192 | def dim(self) -> int: 193 | return self._dim 194 | 195 | @property 196 | def dtype(self) -> np.dtype: 197 | return self._dtype 198 | 199 | @staticmethod 200 | def list_models(**kwargs) -> List[str]: 201 | """Lists available embedding models from Cohere.""" 202 | return [ 203 | "embed-english-v3.0", 204 | "embed-multilingual-v3.0", 205 | "embed-english-light-v3.0", 206 | "embed-multilingual-light-v3.0", 207 | "embed-english-v2.0", 208 | "embed-english-light-v2.0", 209 | "embed-multilingual-v2.0", 210 | ] -------------------------------------------------------------------------------- /examples/basic_usage.py: -------------------------------------------------------------------------------- 1 | # examples/basic_usage.py 2 | import safe_store 3 | from pathlib import Path 4 | import time 5 | import shutil 6 | 7 | # --- Configuration --- 8 | # Activate or deactivate examples for each vectorizer type. 
9 | # Each example will create its own separate database file. 10 | USE_ST = True # Sentence-Transformers (local model) 11 | USE_TFIDF = True # TF-IDF (local, data-dependent) 12 | USE_OLLAMA = True # Ollama (requires running Ollama server) 13 | USE_OPENAI = False # OpenAI (requires API key) 14 | USE_COHERE = False # Cohere (requires API key) 15 | USE_PARSING = True # Set to False if parsing libs not installed 16 | 17 | # --- Vectorizer Configurations --- 18 | # The new way: define vectorizer type and its config separately. 19 | st_config = {"model": "all-MiniLM-L6-v2"} 20 | tfidf_config = {"name": "my_tfidf"} # 'name' is just an identifier for this fitted model 21 | ollama_config = {"model": "qwen3-embedding:0.6b"} # Ensure you have pulled this model in Ollama 22 | openai_config = {"model": "text-embedding-3-small"} # Key from OPENAI_API_KEY env var 23 | cohere_config = {"model": "embed-english-v3.0"} # Key from COHERE_API_KEY env var 24 | 25 | # --- Helper Functions --- 26 | def print_header(title): 27 | print("\n" + "="*10 + f" {title} " + "="*10) 28 | 29 | def cleanup_db_files(db_file): 30 | """Cleans up only the database and its associated files.""" 31 | db_path = Path(db_file) 32 | paths_to_delete = [ 33 | db_path, 34 | Path(f"{db_path}.lock"), 35 | Path(f"{db_path}-wal"), 36 | Path(f"{db_path}-shm") 37 | ] 38 | for p in paths_to_delete: 39 | p.unlink(missing_ok=True) 40 | print(f"- Cleaned up database artifacts for {db_file}") 41 | 42 | # --- Document Preparation --- 43 | def prepare_documents(doc_dir="temp_docs_basic"): 44 | DOC_DIR = Path(doc_dir) 45 | # Clean up and recreate the directory from scratch at the beginning 46 | if DOC_DIR.exists(): 47 | shutil.rmtree(DOC_DIR) 48 | print_header("Preparing Sample Documents") 49 | DOC_DIR.mkdir(exist_ok=True) 50 | 51 | (DOC_DIR / "intro.txt").write_text( 52 | "safe_store is a Python library for local vector storage.", encoding='utf-8' 53 | ) 54 | (DOC_DIR / "update_later.txt").write_text( 55 | "Initial content for update testing.", encoding='utf-8' 56 | ) 57 | if USE_PARSING: 58 | (DOC_DIR / "web_snippet.html").write_text( 59 | "

Efficient retrieval is crucial for RAG pipelines.
", 60 | encoding='utf-8' 61 | ) 62 | print(f"- Documents created in: {DOC_DIR.resolve()}") 63 | 64 | # --- Main Script --- 65 | if __name__ == "__main__": 66 | # --- Discover and Print Available Vectorizers --- 67 | print_header("Discovering Available Vectorizers") 68 | available_vectorizers = safe_store.SafeStore.list_available_vectorizers() 69 | for vec in available_vectorizers: 70 | print(f"\n- Vectorizer: {vec['name']} ({vec.get('title', 'No Title')})") 71 | print(f" Description: {vec.get('description', 'N/A').strip()}") 72 | if vec.get('input_parameters'): 73 | print(" Parameters:") 74 | for param in vec['input_parameters']: 75 | default_val = f" (default: {param['default']})" if 'default' in param else "" 76 | mandatory_flag = "[MANDATORY]" if param.get('mandatory') else "[OPTIONAL]" 77 | print(f" - {param['name']}: {param.get('description', 'N/A')} {mandatory_flag}{default_val}") 78 | 79 | DOC_DIR = Path("temp_docs_basic") 80 | prepare_documents(DOC_DIR) 81 | 82 | # --- Example 1: Sentence Transformer (ST) --- 83 | if USE_ST: 84 | db_file_st = "st_store.db" 85 | print_header(f"Sentence Transformer Example (DB: {db_file_st})") 86 | cleanup_db_files(db_file_st) 87 | try: 88 | store_st = safe_store.SafeStore( 89 | db_path=db_file_st, 90 | vectorizer_name="st", 91 | vectorizer_config=st_config, 92 | log_level=safe_store.LogLevel.INFO 93 | ) 94 | with store_st: 95 | store_st.add_document(DOC_DIR / "intro.txt", metadata={"topic": "introduction"}) 96 | if USE_PARSING: 97 | store_st.add_document(DOC_DIR / "web_snippet.html", metadata={"source": "web"}) 98 | 99 | results_st = store_st.query("local database library", top_k=1) 100 | if results_st: 101 | res = results_st[0] 102 | print(f" Query Result: Score={res['similarity_percent']:.2f}%, Text='{res['chunk_text'][:60]}...'") 103 | 104 | # NEW: Demonstrate vectorizing with metadata 105 | print("\n Demonstrating vectorization with metadata...") 106 | store_st.add_text( 107 | unique_id="metadata_vectorization_test", 108 | text="This text is about oranges and lemons.", 109 | metadata={"topic": "citrus fruits", "author": "test"}, 110 | vectorize_with_metadata=True, # This is the new option 111 | force_reindex=True 112 | ) 113 | # This query should be more similar to the metadata ("citrus") than the other documents. 
114 | results_meta = store_st.query("information about citrus", top_k=1) 115 | if results_meta: 116 | res = results_meta[0] 117 | print(f" Querying with metadata context ('citrus'): Score={res['similarity_percent']:.2f}%, Path='{res['file_path']}'") 118 | if res['file_path'] == 'metadata_vectorization_test': 119 | print(" SUCCESS: The most relevant result came from the document with vectorized metadata.") 120 | else: 121 | print(" NOTE: The top result was not the one with vectorized metadata, which might happen with some models.") 122 | 123 | print("\n Demonstrating file update...") 124 | (DOC_DIR / "update_later.txt").write_text("This content is new and improved for re-indexing.") 125 | store_st.add_document(DOC_DIR / "update_later.txt", force_reindex=True) 126 | print(" 'update_later.txt' has been re-indexed.") 127 | 128 | except safe_store.ConfigurationError as e: 129 | print(f" [SKIP] Could not run ST example: {e}") 130 | except Exception as e: 131 | print(f" [ERROR] An unexpected error occurred: {e}") 132 | 133 | # --- Example 2: TF-IDF --- 134 | if USE_TFIDF: 135 | db_file_tfidf = "tfidf_store.db" 136 | print_header(f"TF-IDF Example (DB: {db_file_tfidf})") 137 | cleanup_db_files(db_file_tfidf) 138 | try: 139 | store_tfidf = safe_store.SafeStore( 140 | db_path=db_file_tfidf, 141 | vectorizer_name="tfidf", 142 | vectorizer_config=tfidf_config, 143 | chunking_strategy='character' 144 | ) 145 | with store_tfidf: 146 | print(" Adding documents (this will fit the TF-IDF model)...") 147 | store_tfidf.add_document(DOC_DIR / "intro.txt") 148 | if USE_PARSING: 149 | store_tfidf.add_document(DOC_DIR / "web_snippet.html") 150 | 151 | results_tfidf = store_tfidf.query("SQLite backend storage", top_k=1) 152 | if results_tfidf: 153 | res = results_tfidf[0] 154 | print(f" Query Result: Score={res['similarity_percent']:.2f}%, Text='{res['chunk_text'][:60]}...'") 155 | 156 | except safe_store.ConfigurationError as e: 157 | print(f" [SKIP] Could not run TF-IDF example: {e}") 158 | except Exception as e: 159 | print(f" [ERROR] An unexpected error occurred: {e}") 160 | 161 | # --- Example 3: Ollama --- 162 | if USE_OLLAMA: 163 | db_file_ollama = "ollama_store.db" 164 | print_header(f"Ollama Example with Custom Tokenizer (DB: {db_file_ollama})") 165 | cleanup_db_files(db_file_ollama) 166 | try: 167 | available_models = safe_store.SafeStore.list_models("ollama") 168 | print(f" Found Ollama models: {available_models}") 169 | if ollama_config["model"] not in available_models: 170 | print(f" [SKIP] Model '{ollama_config['model']}' not found in Ollama.") 171 | else: 172 | store_ollama = safe_store.SafeStore( 173 | db_path=db_file_ollama, 174 | vectorizer_name="ollama", 175 | vectorizer_config=ollama_config, 176 | # --- NEW: Use token-based chunking by providing a custom tokenizer --- 177 | chunking_strategy='token', 178 | custom_tokenizer={"name": "tiktoken", "model": "cl100k_base"} 179 | ) 180 | with store_ollama: 181 | store_ollama.add_document(DOC_DIR / "intro.txt") 182 | results_ollama = store_ollama.query("file-based vector db", top_k=1) 183 | if results_ollama: 184 | res = results_ollama[0] 185 | print(f" Query Result: Score={res['similarity_percent']:.2f}%, Text='{res['chunk_text'][:60]}...'") 186 | 187 | except safe_store.VectorizationError as e: 188 | print(f" [SKIP] Could not connect to Ollama server: {e}") 189 | except Exception as e: 190 | print(f" [ERROR] An unexpected error occurred: {e}") 191 | 192 | 193 | # --- API-based examples --- 194 | if USE_OPENAI: 195 | 
db_file_openai = "openai_store.db" 196 | print_header(f"OpenAI Example (DB: {db_file_openai})") 197 | cleanup_db_files(db_file_openai) 198 | try: 199 | store_openai = safe_store.SafeStore( 200 | db_path=db_file_openai, 201 | vectorizer_name="openai", 202 | vectorizer_config=openai_config, 203 | chunking_strategy='character' # Also required for OpenAI 204 | ) 205 | with store_openai: 206 | store_openai.add_document(DOC_DIR / "intro.txt") 207 | results_openai = store_openai.query("python tool for embeddings", top_k=1) 208 | if results_openai: 209 | print(f" Query Result: Score={results_openai[0]['similarity_percent']:.2f}%") 210 | except Exception as e: 211 | print(f" [ERROR] OpenAI example failed: {e}") 212 | 213 | if USE_COHERE: 214 | db_file_cohere = "cohere_store.db" 215 | print_header(f"Cohere Example (DB: {db_file_cohere})") 216 | cleanup_db_files(db_file_cohere) 217 | try: 218 | store_cohere = safe_store.SafeStore( 219 | db_path=db_file_cohere, 220 | vectorizer_name="cohere", 221 | vectorizer_config=cohere_config, 222 | chunking_strategy='character' # Also required for Cohere 223 | ) 224 | with store_cohere: 225 | store_cohere.add_document(DOC_DIR / "intro.txt") 226 | results_cohere = store_cohere.query("library for vector search", top_k=1) 227 | if results_cohere: 228 | print(f" Query Result: Score={results_cohere[0]['similarity_percent']:.2f}%") 229 | except Exception as e: 230 | print(f" [ERROR] Cohere example failed: {e}") 231 | 232 | print("\n--- Final Cleanup ---") 233 | if DOC_DIR.exists(): 234 | shutil.rmtree(DOC_DIR) 235 | print(f"- Removed directory: {DOC_DIR}") 236 | 237 | print("\n--- End of Script ---") -------------------------------------------------------------------------------- /scripts/migration_v1_v2.py: -------------------------------------------------------------------------------- 1 | # migrate_v1_to_v2_argparse.py 2 | import sqlite3 3 | from pathlib import Path 4 | from typing import Union, Optional, Any 5 | import argparse 6 | from ascii_colors import ASCIIColors 7 | 8 | # --- DatabaseError and connect_db remain the same --- 9 | class DatabaseError(Exception): 10 | pass 11 | 12 | def connect_db(db_path: Union[str, Path]) -> sqlite3.Connection: 13 | db_path_obj = Path(db_path).resolve() 14 | try: 15 | db_path_obj.parent.mkdir(parents=True, exist_ok=True) 16 | conn = sqlite3.connect( 17 | str(db_path_obj), 18 | detect_types=sqlite3.PARSE_DECLTYPES, 19 | check_same_thread=False 20 | ) 21 | conn.execute("PRAGMA journal_mode=WAL;") 22 | conn.execute("PRAGMA foreign_keys = ON;") 23 | ASCIIColors.debug(f"Connected to database: {db_path_obj} (WAL enabled)") 24 | return conn 25 | except sqlite3.Error as e: 26 | msg = f"Database connection error to {db_path_obj}: {e}" 27 | ASCIIColors.error(msg, exc_info=True) 28 | raise DatabaseError(msg) from e 29 | 30 | # --- set_store_metadata and get_store_metadata remain the same --- 31 | def set_store_metadata(conn: sqlite3.Connection, key: str, value: str) -> None: 32 | sql = "INSERT OR REPLACE INTO store_metadata (key, value) VALUES (?, ?)" 33 | cursor = conn.cursor() 34 | try: 35 | cursor.execute(sql, (key, value)) 36 | ASCIIColors.debug(f"Set store_metadata: {key} = {value}") 37 | except sqlite3.Error as e: 38 | msg = f"Error setting store metadata '{key}': {e}" 39 | ASCIIColors.error(msg, exc_info=True) 40 | raise DatabaseError(msg) from e 41 | 42 | def get_store_metadata(conn: sqlite3.Connection, key: str) -> Optional[str]: 43 | cursor = conn.cursor() 44 | try: 45 | cursor.execute("SELECT 1 FROM sqlite_master WHERE 
type='table' AND name='store_metadata';") 46 | if not cursor.fetchone(): 47 | return None 48 | 49 | sql = "SELECT value FROM store_metadata WHERE key = ?" 50 | cursor.execute(sql, (key,)) 51 | result = cursor.fetchone() 52 | return result[0] if result else None 53 | except sqlite3.Error as e: 54 | ASCIIColors.warning(f"Could not get store metadata for key '{key}' (may not exist yet): {e}") 55 | return None 56 | 57 | def table_exists(cursor: sqlite3.Cursor, table_name: str) -> bool: 58 | """Checks if a table exists in the database.""" 59 | cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name=?;", (table_name,)) 60 | return cursor.fetchone() is not None 61 | 62 | def migrate_v1_to_v2(db_path: Path, auto_yes: bool = False): 63 | """ 64 | Migrates the SafeStore database from v1.0 schema to v2.0 schema. 65 | Adds graph-related tables and columns. 66 | 67 | Args: 68 | db_path: Path object to the database file. 69 | auto_yes: If True, skips interactive prompts. 70 | """ 71 | ASCIIColors.info(f"Attempting migration for database: {db_path}") 72 | 73 | if not db_path.exists(): 74 | ASCIIColors.error(f"Database file {db_path} does not exist. Cannot migrate.") 75 | ASCIIColors.info("If this is a new setup, the main application will initialize it to v2.0.") 76 | return False 77 | 78 | if not auto_yes: 79 | ASCIIColors.warning("IMPORTANT: Please backup your database file before proceeding!") 80 | try: 81 | if not Path("/dev/tty").is_char_device(): 82 | ASCIIColors.info("Non-interactive environment detected, proceeding without prompt.") 83 | elif input("Press Enter to continue or Ctrl+C to abort..."): 84 | ASCIIColors.info("Migration aborted by user input.") 85 | return False 86 | except (EOFError, KeyboardInterrupt): 87 | ASCIIColors.info("Migration aborted.") 88 | return False 89 | except Exception: 90 | ASCIIColors.info("Could not get interactive input, proceeding with caution. Use --yes to bypass.") 91 | 92 | conn = None 93 | try: 94 | conn = connect_db(db_path) 95 | cursor = conn.cursor() 96 | 97 | # --- Pre-migration V1 Schema Check --- 98 | ASCIIColors.info("Performing pre-migration schema check...") 99 | required_v1_tables = ["documents", "vectorization_methods", "chunks", "vectors"] 100 | missing_v1_tables = [] 101 | for table_name in required_v1_tables: 102 | if not table_exists(cursor, table_name): 103 | missing_v1_tables.append(table_name) 104 | 105 | if missing_v1_tables: 106 | ASCIIColors.error(f"The database at '{db_path}' is missing essential v1.0 tables: {', '.join(missing_v1_tables)}.") 107 | ASCIIColors.error("This script expects a database with a valid v1.0 schema.") 108 | ASCIIColors.info("If this is an empty database, your application should initialize it directly to v2.0.") 109 | return False 110 | ASCIIColors.green("Basic v1.0 schema tables found.") 111 | 112 | 113 | # --- Version Check (after confirming basic tables exist) --- 114 | current_version = get_store_metadata(conn, 'schema_version') 115 | if current_version == '2.0': 116 | ASCIIColors.success(f"Database '{db_path}' is already at schema version 2.0. No migration needed.") 117 | return True 118 | elif current_version: 119 | ASCIIColors.warning(f"Database '{db_path}' has an existing schema version: '{current_version}'.") 120 | ASCIIColors.warning("This script is designed for v1.0 (no version marker) to v2.0 migration.") 121 | if not auto_yes: 122 | if input(f"Continue migration from '{current_version}' to '2.0'? 
(yes/NO): ").lower() != 'yes': 123 | ASCIIColors.info("Migration aborted by user.") 124 | return False 125 | else: 126 | ASCIIColors.info(f"Auto-proceeding with migration from '{current_version}' to '2.0'.") 127 | else: 128 | ASCIIColors.info("No schema_version metadata found. Assuming v1.0 database.") 129 | 130 | 131 | ASCIIColors.info("Proceeding with v1.0 to v2.0 migration tasks...") 132 | 133 | cursor.execute("PRAGMA foreign_keys=OFF;") 134 | 135 | # 1. Add 'graph_processed_at' column and index to 'chunks' table 136 | ASCIIColors.info("Updating 'chunks' table (guaranteed to exist by pre-check)...") 137 | cursor.execute("PRAGMA table_info(chunks);") 138 | columns_in_chunks = [info[1] for info in cursor.fetchall()] 139 | if 'graph_processed_at' not in columns_in_chunks: 140 | cursor.execute("ALTER TABLE chunks ADD COLUMN graph_processed_at DATETIME;") 141 | ASCIIColors.info("Added 'graph_processed_at' column to 'chunks'.") 142 | cursor.execute("CREATE INDEX IF NOT EXISTS idx_chunk_graph_processed_at ON chunks (graph_processed_at);") 143 | ASCIIColors.green("'chunks' table updated and indexed.") 144 | 145 | # 2. Create 'store_metadata' table 146 | ASCIIColors.info("Ensuring 'store_metadata' table exists...") 147 | cursor.execute(""" 148 | CREATE TABLE IF NOT EXISTS store_metadata (key TEXT PRIMARY KEY, value TEXT); 149 | """) 150 | ASCIIColors.green("'store_metadata' table ensured.") 151 | 152 | # 3. Create 'graph_nodes' table and indexes 153 | ASCIIColors.info("Ensuring 'graph_nodes' table and indexes...") 154 | cursor.execute(""" 155 | CREATE TABLE IF NOT EXISTS graph_nodes ( 156 | node_id INTEGER PRIMARY KEY AUTOINCREMENT, node_label TEXT NOT NULL, 157 | node_properties TEXT, unique_signature TEXT UNIQUE); 158 | """) 159 | cursor.execute("CREATE INDEX IF NOT EXISTS idx_graph_node_label ON graph_nodes (node_label);") 160 | cursor.execute("CREATE INDEX IF NOT EXISTS idx_graph_node_signature ON graph_nodes (unique_signature);") 161 | ASCIIColors.green("'graph_nodes' table and indexes ensured.") 162 | 163 | # 4. Create 'graph_relationships' table and indexes 164 | ASCIIColors.info("Ensuring 'graph_relationships' table and indexes...") 165 | cursor.execute(""" 166 | CREATE TABLE IF NOT EXISTS graph_relationships ( 167 | relationship_id INTEGER PRIMARY KEY AUTOINCREMENT, source_node_id INTEGER NOT NULL, 168 | target_node_id INTEGER NOT NULL, relationship_type TEXT NOT NULL, 169 | relationship_properties TEXT, 170 | FOREIGN KEY (source_node_id) REFERENCES graph_nodes (node_id) ON DELETE CASCADE, 171 | FOREIGN KEY (target_node_id) REFERENCES graph_nodes (node_id) ON DELETE CASCADE); 172 | """) 173 | cursor.execute("CREATE INDEX IF NOT EXISTS idx_graph_rel_source_type ON graph_relationships (source_node_id, relationship_type);") 174 | cursor.execute("CREATE INDEX IF NOT EXISTS idx_graph_rel_target_type ON graph_relationships (target_node_id, relationship_type);") 175 | cursor.execute("CREATE INDEX IF NOT EXISTS idx_graph_rel_type ON graph_relationships (relationship_type);") 176 | ASCIIColors.green("'graph_relationships' table and indexes ensured.") 177 | 178 | # 5. 
Create 'node_chunk_links' table and indexes 179 | ASCIIColors.info("Ensuring 'node_chunk_links' table and indexes...") 180 | cursor.execute(""" 181 | CREATE TABLE IF NOT EXISTS node_chunk_links ( 182 | node_id INTEGER NOT NULL, chunk_id INTEGER NOT NULL, 183 | FOREIGN KEY (node_id) REFERENCES graph_nodes (node_id) ON DELETE CASCADE, 184 | FOREIGN KEY (chunk_id) REFERENCES chunks (chunk_id) ON DELETE CASCADE, 185 | PRIMARY KEY (node_id, chunk_id)); 186 | """) 187 | cursor.execute("CREATE INDEX IF NOT EXISTS idx_ncl_node_id ON node_chunk_links (node_id);") 188 | cursor.execute("CREATE INDEX IF NOT EXISTS idx_ncl_chunk_id ON node_chunk_links (chunk_id);") 189 | ASCIIColors.green("'node_chunk_links' table and indexes ensured.") 190 | 191 | # 6. Update schema version in store_metadata 192 | ASCIIColors.info("Updating schema version to 2.0 in 'store_metadata'.") 193 | cursor.execute("INSERT OR REPLACE INTO store_metadata (key, value) VALUES (?, ?)", ('schema_version', '2.0')) 194 | 195 | cursor.execute("PRAGMA foreign_keys=ON;") 196 | 197 | conn.commit() 198 | ASCIIColors.success(f"Database migration to v2.0 completed successfully for: {db_path}") 199 | return True 200 | 201 | except sqlite3.Error as e: 202 | ASCIIColors.error(f"SQLite error during migration: {e}") 203 | if conn: 204 | ASCIIColors.warning("Rolling back changes due to error.") 205 | conn.rollback() 206 | return False 207 | except DatabaseError as e: 208 | ASCIIColors.error(f"Database operation error during migration: {e}") 209 | if conn: 210 | ASCIIColors.warning("Rolling back changes due to error.") 211 | conn.rollback() 212 | return False 213 | except Exception as e: 214 | ASCIIColors.error(f"An unexpected error occurred during migration: {e}", exc_info=True) 215 | if conn: 216 | ASCIIColors.warning("Rolling back changes due to error.") 217 | conn.rollback() 218 | return False 219 | finally: 220 | if conn: 221 | conn.close() 222 | ASCIIColors.debug("Database connection closed.") 223 | 224 | # --- main() function with argparse remains the same --- 225 | def main(): 226 | parser = argparse.ArgumentParser( 227 | description="Migrate SafeStore SQLite database from v1.0 schema to v2.0 schema.", 228 | formatter_class=argparse.RawTextHelpFormatter, 229 | epilog=""" 230 | Example usage: 231 | python %(prog)s /path/to/your/safestore.db 232 | python %(prog)s my_database.sqlite --yes 233 | 234 | This script adds new tables and columns for graph database functionality. 235 | It is designed to be run on a database created with a pre-graph version of SafeStore. 236 | Ensure you have a backup of your database before running this script. 237 | """ 238 | ) 239 | parser.add_argument( 240 | "db_path", 241 | type=Path, 242 | help="Path to the SQLite database file to migrate." 243 | ) 244 | parser.add_argument( 245 | "--yes", 246 | "-y", 247 | action="store_true", 248 | help="Automatically answer 'yes' to confirmation prompts (use with caution)." 249 | ) 250 | 251 | args = parser.parse_args() 252 | 253 | if migrate_v1_to_v2(args.db_path, auto_yes=args.yes): 254 | ASCIIColors.highlight("Migration process finished.") 255 | else: 256 | ASCIIColors.critical("Migration process failed or was aborted. 
Please check the logs.") 257 | exit(1) 258 | 259 | if __name__ == "__main__": 260 | main() 261 | -------------------------------------------------------------------------------- /examples/graph_usage.py: -------------------------------------------------------------------------------- 1 | # [FINAL & ROBUST] examples/graph_usage.py 2 | import safe_store 3 | from safe_store import GraphStore, LogLevel, SafeStore 4 | import pipmaster as pm 5 | 6 | pm.ensure_packages(["lollms_client"]) 7 | from lollms_client import LollmsClient 8 | from ascii_colors import ASCIIColors, trace_exception 9 | import sqlite3 10 | from pathlib import Path 11 | import json 12 | import shutil 13 | from typing import Dict, List, Any, Optional 14 | 15 | # --- Configuration --- 16 | DB_FILE = "graph_example_store.db" 17 | DOC_DIR = Path("temp_docs_graph_example") 18 | 19 | # --- LOLLMS Client Configuration --- 20 | BINDING_NAME = "ollama" 21 | HOST_ADDRESS = "http://localhost:11434" 22 | MODEL_NAME = "mistral:latest" 23 | 24 | # --- Ontology Definitions --- 25 | DETAILED_ONTOLOGY = { 26 | "nodes": { 27 | "Person": {"description": "A human individual.", "properties": {"name": "string", "title": "string"}}, 28 | "Company": {"description": "A commercial business.", "properties": {"name": "string", "location": "string"}}, 29 | "Product": {"description": "A product created by a company.", "properties": {"name": "string"}}, 30 | "ResearchPaper": {"description": "An academic publication.", "properties": {"title": "string"}}, 31 | "University": {"description": "An institution of higher education.", "properties": {"name": "string"}} 32 | }, 33 | "relationships": { 34 | "WORKS_AT": {"description": "Person is employed by Company.", "source": "Person", "target": "Company"}, 35 | "CEO_OF": {"description": "Person is the CEO of Company.", "source": "Person", "target": "Company"}, 36 | "FOUNDED": {"description": "Person founded a Company.", "source": "Person", "target": "Company"}, 37 | "COMPETITOR_OF": {"description": "Company is a competitor of another Company.", "source": "Company", "target": "Company"}, 38 | "PRODUCES": {"description": "Company creates a Product.", "source": "Company", "target": "Product"}, 39 | "AUTHOR_OF": {"description": "Person wrote a ResearchPaper.", "source": "Person", "target": "ResearchPaper"}, 40 | "AFFILIATED_WITH": {"description": "Person is associated with a University.", "source": "Person", "target": "University"} 41 | } 42 | } 43 | SIMPLE_ONTOLOGY = { 44 | "nodes": {"Entity": {"description": "A person, company, or organization.", "properties": {"name": "string"}}}, 45 | "relationships": {"IS_RELATED_TO": {"description": "Indicates a general connection between two entities.", "source": "Entity", "target": "Entity"}} 46 | } 47 | 48 | # NEW: Ontology as a simple string of instructions 49 | STRING_ONTOLOGY = """ 50 | - Extract People, Companies, and Products as nodes. 51 | - For 'People' nodes, extract their full name and any job title mentioned as properties. 52 | - For 'Companies' nodes, extract their full name and location as properties. 53 | - For 'Products' nodes, extract their name. 54 | - Create relationships like WORKS_AT, CEO_OF, and PRODUCES between these nodes. 
55 | """ 56 | 57 | 58 | LC_CLIENT: Optional[LollmsClient] = None 59 | 60 | def initialize_lollms_client() -> bool: 61 | global LC_CLIENT 62 | if LC_CLIENT is None: 63 | ASCIIColors.info(f"Initializing LollmsClient: Binding='{BINDING_NAME}', Host='{HOST_ADDRESS}', Model='{MODEL_NAME}'") 64 | try: 65 | LC_CLIENT = LollmsClient(llm_binding_name=BINDING_NAME, llm_binding_config={"host_address": HOST_ADDRESS, "model_name": MODEL_NAME}) 66 | if not LC_CLIENT.llm: 67 | ASCIIColors.error(f"LollmsClient binding '{BINDING_NAME}' is not ready."); LC_CLIENT = None; return False 68 | ASCIIColors.success("LollmsClient initialized and ready.") 69 | return True 70 | except Exception as e: 71 | ASCIIColors.error(f"Failed to initialize LollmsClient: {e}"); trace_exception(e); LC_CLIENT = None; return False 72 | return True 73 | 74 | def llm_executor_callback(full_prompt: str) -> str: 75 | global LC_CLIENT 76 | if LC_CLIENT is None: raise ConnectionError("LollmsClient not initialized.") 77 | try: 78 | return LC_CLIENT.generate_code(full_prompt, language="json", temperature=0.05, top_k=10) 79 | except Exception as e: 80 | raise RuntimeError(f"LLM execution for JSON failed: {e}") from e 81 | 82 | def generate_answer_from_context(question: str, graph_data: Dict, chunks_data: Optional[List[Dict]] = None) -> str: 83 | global LC_CLIENT 84 | if LC_CLIENT is None: return "LLM not available." 85 | context_lines = ["--- CONTEXT ---"] 86 | if graph_data and graph_data.get("nodes"): 87 | context_lines.append("\n[Graph Information]:") 88 | node_map = {n['node_id']: n for n in graph_data['nodes']} 89 | 90 | def get_node_instance_name(node_id: int) -> str: 91 | """Helper to get the best possible name for a node instance.""" 92 | node = node_map.get(node_id) 93 | if not node: 94 | return f"ID:{node_id}" 95 | props = node.get('properties', {}) 96 | # Prioritize 'identifying_value', then 'name', then 'title' before falling back to ID. 97 | return props.get('identifying_value') or props.get('name') or props.get('title') or f"ID:{node_id}" 98 | 99 | for node in graph_data['nodes']: 100 | instance_name = get_node_instance_name(node['node_id']) 101 | context_lines.append(f"- Instance '{instance_name}' (type: {node['label']}): {json.dumps(node.get('properties', {}))}") 102 | 103 | for rel in graph_data.get('relationships', []): 104 | src_name = get_node_instance_name(rel['source_node_id']) 105 | tgt_name = get_node_instance_name(rel['target_node_id']) 106 | context_lines.append(f"- Relationship: '{src_name}' --[{rel['type']}]--> '{tgt_name}'") 107 | 108 | if chunks_data: 109 | context_lines.append("\n[Relevant Text Snippets]:") 110 | for i, chunk in enumerate(chunks_data): 111 | context_lines.append(f"- Snippet {i+1}: \"{chunk['chunk_text']}\"") 112 | context_lines.append("\n--- END OF CONTEXT ---") 113 | context_str = "\n".join(context_lines) 114 | 115 | prompt = (f"Answer the user's question based ONLY on the provided context. Do not use prior knowledge.\n\n" 116 | f"{context_str}\n\nQuestion: {question}") 117 | 118 | ASCIIColors.magenta("--- Sending Synthesis Prompt to LLM ---") 119 | try: 120 | return LC_CLIENT.generate_text(prompt, n_predict=512) 121 | except Exception as e: 122 | ASCIIColors.error(f"Error during answer synthesis: {e}") 123 | return "Error generating the answer." 
124 | 125 | def print_header(title: str): 126 | print("\n" + "="*25 + f" {title} " + "="*25) 127 | 128 | def cleanup(): 129 | print_header("Cleaning Up Previous Run") 130 | paths = [Path(DB_FILE), Path(f"{DB_FILE}.lock"), Path(f"{DB_FILE}-wal"), Path(f"{DB_FILE}-shm"), DOC_DIR] 131 | for p in paths: 132 | try: 133 | if p.is_file(): p.unlink(missing_ok=True); print(f"- Removed file: {p}") 134 | elif p.is_dir(): shutil.rmtree(p, ignore_errors=True); print(f"- Removed directory: {p}") 135 | except OSError as e: print(f"- Warning: Could not remove {p}: {e}") 136 | 137 | def clear_graph_data(conn: sqlite3.Connection): 138 | ASCIIColors.warning("\nClearing all existing graph data from the database...") 139 | try: 140 | conn.execute("BEGIN") 141 | conn.execute("DELETE FROM node_chunk_links;") 142 | conn.execute("DELETE FROM graph_relationships;") 143 | conn.execute("DELETE FROM graph_nodes;") 144 | conn.execute("UPDATE chunks SET graph_processed_at = NULL;") 145 | conn.commit() 146 | ASCIIColors.success("Graph data cleared.") 147 | except sqlite3.Error as e: 148 | conn.rollback() 149 | ASCIIColors.error(f"Failed to clear graph data: {e}") 150 | 151 | if __name__ == "__main__": 152 | cleanup() 153 | if not initialize_lollms_client(): 154 | ASCIIColors.error("Exiting: LollmsClient initialization failure."); exit(1) 155 | 156 | ASCIIColors.set_log_level(LogLevel.INFO) 157 | 158 | try: 159 | print_header("Preparing Documents (One-time setup)") 160 | DOC_DIR.mkdir(exist_ok=True, parents=True) 161 | doc1_content = "Acme Innovations, led by CEO Dr. Evelyn Reed, is a tech company based in Silicon Valley. Their flagship product, 'NovaCore', was launched in 2023. John Doe works as a Senior Engineer at Acme Innovations and reports to Dr. Reed. Acme Innovations is a competitor of Beta Solutions." 162 | (DOC_DIR / "company_info.txt").write_text(doc1_content.strip(), encoding='utf-8') 163 | doc2_content = "The research paper 'Quantum Entanglement in Nanostructures' by Dr. Alice Smith cites work by Dr. Evelyn Reed on early quantum theories. Dr. Reed is also known for her work at Acme Innovations." 164 | (DOC_DIR / "research_paper_snippet.txt").write_text(doc2_content.strip(), encoding='utf-8') 165 | 166 | with SafeStore(db_path=DB_FILE) as store: 167 | store.add_document(DOC_DIR / "company_info.txt") 168 | store.add_document(DOC_DIR / "research_paper_snippet.txt") 169 | 170 | print_header("PASS 1: Building Graph with DETAILED Ontology") 171 | graph_store_detailed = GraphStore(store=store, llm_executor_callback=llm_executor_callback, ontology=DETAILED_ONTOLOGY) 172 | graph_store_detailed.build_graph_for_all_documents() 173 | ASCIIColors.success("Graph building with detailed ontology complete.") 174 | 175 | print_header("DEMO 1.1: RAG Query (Who is Dr. Evelyn Reed?)") 176 | query = "Who is Dr. Evelyn Reed and what companies is she associated with?" 
177 | result = graph_store_detailed.query_graph(query, output_mode="full") 178 | full_answer = generate_answer_from_context(query, result.get('graph'), result.get('chunks')) 179 | ASCIIColors.green("Final Answer (from Graph + Chunks):") 180 | print(full_answer) 181 | 182 | print_header("DEMO 1.2: Manually Editing the Graph") 183 | ASCIIColors.info("We will manually add a new product 'ChronoLeap' and link it to an 'Acme' company.") 184 | 185 | company_nodes = graph_store_detailed.get_nodes_by_label("Company") 186 | acme_node = next((n for n in company_nodes if 'acme' in n.get('properties', {}).get('name', '').lower()), None) 187 | 188 | if acme_node: 189 | acme_id = acme_node['node_id'] 190 | acme_name = acme_node['properties']['name'] 191 | ASCIIColors.info(f"Found '{acme_name}' with Node ID: {acme_id}") 192 | 193 | product_id = graph_store_detailed.add_node(label="Product", properties={"name": "ChronoLeap"}) 194 | ASCIIColors.info(f"Created new 'ChronoLeap' product with Node ID: {product_id}") 195 | 196 | rel_id = graph_store_detailed.add_relationship(acme_id, product_id, "PRODUCES") 197 | ASCIIColors.info(f"Linked them with 'PRODUCES' relationship (ID: {rel_id})") 198 | 199 | print_header("DEMO 1.3: Querying the Manually Added Data") 200 | manual_query = "What new products does Acme produce?" 201 | manual_result = graph_store_detailed.query_graph(manual_query, output_mode="full") 202 | manual_answer = generate_answer_from_context(manual_query, manual_result.get('graph')) 203 | ASCIIColors.green("Final Answer (from Graph-Only):") 204 | print(manual_answer) 205 | else: 206 | ASCIIColors.warning("Could not find any 'Acme' company node to perform manual edit demo.") 207 | 208 | print_header("PASS 2: Rebuilding Graph with SIMPLE Ontology") 209 | clear_graph_data(store.conn) 210 | 211 | graph_store_simple = GraphStore(store=store, llm_executor_callback=llm_executor_callback, ontology=SIMPLE_ONTOLOGY) 212 | graph_store_simple.build_graph_for_all_documents() 213 | ASCIIColors.success("Graph building with simple ontology complete.") 214 | 215 | print_header("DEMO 2.1: Observing the new simple graph structure") 216 | simple_nodes = graph_store_simple.get_nodes_by_label("Entity", limit=10) 217 | ASCIIColors.blue("\nNodes extracted with the simple 'Entity' label:") 218 | if simple_nodes: 219 | for n in simple_nodes: print(f" - ID: {n['node_id']}, Props: {n.get('properties')}") 220 | else: 221 | print(" No 'Entity' nodes found.") 222 | 223 | print_header("PASS 3: Rebuilding Graph with STRING-BASED Ontology") 224 | clear_graph_data(store.conn) 225 | 226 | graph_store_string = GraphStore(store=store, llm_executor_callback=llm_executor_callback, ontology=STRING_ONTOLOGY) 227 | graph_store_string.build_graph_for_all_documents() 228 | ASCIIColors.success("Graph building with string-based ontology complete.") 229 | 230 | print_header("DEMO 3.1: Observing the graph from string ontology") 231 | string_nodes_viz = graph_store_string.get_all_nodes_for_visualization(limit=15) 232 | ASCIIColors.blue("\nNodes extracted with the string ontology:") 233 | if string_nodes_viz: 234 | for n in string_nodes_viz: print(f" - Label: {n['label']}, Props: {n.get('properties')}") 235 | else: 236 | print(" No nodes found.") 237 | 238 | 239 | except Exception as e: 240 | ASCIIColors.error(f"An unexpected error occurred in the main process: {e}") 241 | trace_exception(e) 242 | finally: 243 | print_header("Example Finished") 244 | ASCIIColors.info(f"Database file is at: {Path(DB_FILE).resolve()}") 
--------------------------------------------------------------------------------
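A minimal sketch of driving the migration helper in scripts/migration_v1_v2.py from Python instead of the command line. It assumes the script sits at scripts/migration_v1_v2.py relative to the working directory and that ascii_colors is installed; since no scripts/__init__.py appears in the tree, the module is loaded by file path rather than imported as a package. The file name sketch_migrate.py and the database path my_database.sqlite are placeholders.

# sketch_migrate.py - hypothetical driver script, not part of the repository
import importlib.util
from pathlib import Path

# Load scripts/migration_v1_v2.py by file path (the scripts folder is not a package).
script_path = Path("scripts") / "migration_v1_v2.py"
spec = importlib.util.spec_from_file_location("migration_v1_v2", script_path)
migration = importlib.util.module_from_spec(spec)
spec.loader.exec_module(migration)

# migrate_v1_to_v2() returns True on success (or when the store is already at
# schema version 2.0) and False if the migration failed or was aborted.
ok = migration.migrate_v1_to_v2(Path("my_database.sqlite"), auto_yes=True)
print("migration ok" if ok else "migration failed or was aborted")

Passing auto_yes=True skips the interactive backup confirmation, mirroring the --yes flag exposed by the script's own argparse entry point.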