├── docpixie ├── py.typed ├── core │ ├── __init__.py │ ├── utils.py │ └── config.py ├── cli │ ├── __init__.py │ ├── widgets │ │ ├── __init__.py │ │ └── command_palette.py │ ├── styles.py │ ├── task_display.py │ ├── commands.py │ ├── config.py │ ├── state_manager.py │ ├── event_handlers.py │ ├── docpixie_manager.py │ ├── legacy.py │ └── conversation_storage.py ├── ai │ ├── __init__.py │ ├── query_classifier.py │ ├── query_reformulator.py │ ├── summarizer.py │ ├── page_selector.py │ ├── synthesizer.py │ └── context_processor.py ├── utils │ ├── __init__.py │ └── async_helpers.py ├── models │ ├── __init__.py │ ├── agent.py │ └── document.py ├── storage │ ├── __init__.py │ ├── base.py │ └── memory.py ├── processors │ ├── __init__.py │ ├── base.py │ ├── factory.py │ ├── image.py │ └── pdf.py ├── providers │ ├── __init__.py │ ├── factory.py │ ├── base.py │ ├── openai.py │ ├── openrouter.py │ └── anthropic.py ├── __init__.py ├── cli.py └── exceptions.py ├── screenshot.png ├── setup.py ├── MANIFEST.in ├── requirements.txt ├── LICENSE ├── pyproject.toml ├── .gitignore ├── README.md ├── CLAUDE.md └── docs └── cli-tool.md /docpixie/py.typed: -------------------------------------------------------------------------------- 1 | # Marker file for PEP 561 -------------------------------------------------------------------------------- /screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/qnguyen3/docpixie/HEAD/screenshot.png -------------------------------------------------------------------------------- /docpixie/core/__init__.py: -------------------------------------------------------------------------------- 1 | """Core DocPixie components""" 2 | 3 | from .config import DocPixieConfig 4 | 5 | __all__ = ["DocPixieConfig"] -------------------------------------------------------------------------------- /docpixie/cli/__init__.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | DocPixie CLI - Terminal User Interface for document chat 3 | """ 4 | 5 | from .app import main 6 | 7 | __all__ = ["main"] -------------------------------------------------------------------------------- /docpixie/ai/__init__.py: -------------------------------------------------------------------------------- 1 | """AI operations and business logic components""" 2 | 3 | from .summarizer import PageSummarizer 4 | 5 | __all__ = [ 6 | "PageSummarizer" 7 | ] -------------------------------------------------------------------------------- /docpixie/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """Utility functions and helpers""" 2 | 3 | from .async_helpers import sync_wrapper, ensure_async 4 | 5 | __all__ = [ 6 | "sync_wrapper", 7 | "ensure_async" 8 | ] -------------------------------------------------------------------------------- /docpixie/models/__init__.py: -------------------------------------------------------------------------------- 1 | """Document models and data structures""" 2 | 3 | from .document import Document, Page, QueryResult, QueryMode 4 | 5 | __all__ = ["Document", "Page", "QueryResult", "QueryMode"] -------------------------------------------------------------------------------- /docpixie/storage/__init__.py: -------------------------------------------------------------------------------- 1 | """Storage backends for documents and metadata""" 2 | 3 | from .base import BaseStorage 4 | from .local import LocalStorage 5 | from .memory import InMemoryStorage 6 | 7 | __all__ = [ 8 | "BaseStorage", 9 | "LocalStorage", 10 | "InMemoryStorage" 11 | ] -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Setup script for DocPixie package. 
4 | This file exists for backward compatibility with older pip versions. 5 | The actual configuration is in pyproject.toml. 6 | """ 7 | 8 | from setuptools import setup 9 | 10 | if __name__ == "__main__": 11 | setup() -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include LICENSE 3 | include requirements.txt 4 | recursive-include docpixie *.py 5 | include docpixie/py.typed 6 | recursive-include docs *.md 7 | recursive-exclude * __pycache__ 8 | recursive-exclude * *.py[co] 9 | recursive-exclude tests * 10 | recursive-exclude documents * 11 | recursive-exclude docpixie_data * -------------------------------------------------------------------------------- /docpixie/processors/__init__.py: -------------------------------------------------------------------------------- 1 | """Document processors for different file types""" 2 | 3 | from .base import BaseProcessor 4 | from .pdf import PDFProcessor 5 | from .image import ImageProcessor 6 | from .factory import ProcessorFactory 7 | 8 | __all__ = [ 9 | "BaseProcessor", 10 | "PDFProcessor", 11 | "ImageProcessor", 12 | "ProcessorFactory" 13 | ] -------------------------------------------------------------------------------- /docpixie/providers/__init__.py: -------------------------------------------------------------------------------- 1 | """Vision AI providers for DocPixie""" 2 | 3 | from .base import BaseProvider 4 | from .openai import OpenAIProvider 5 | from .anthropic import AnthropicProvider 6 | from .openrouter import OpenRouterProvider 7 | from .factory import create_provider 8 | 9 | __all__ = [ 10 | "BaseProvider", 11 | "OpenAIProvider", 12 | "AnthropicProvider", 13 | "OpenRouterProvider", 14 | "create_provider" 15 | ] -------------------------------------------------------------------------------- /docpixie/__init__.py: 
def main():
    """Launch the DocPixie CLI, preferring the Textual interface.

    The Textual front end is imported lazily.  If that import raises
    ``ImportError`` (for example because Textual is not installed), a short
    notice is printed and the legacy CLI is started instead.

    Only the import is guarded: an ``ImportError`` raised later, while the
    Textual application is already running, propagates to the caller instead
    of being misreported as "Textual not installed".
    """
    try:
        # Imported here (not at module top) so a missing optional dependency
        # is detected at launch time and can trigger the fallback below.
        from docpixie.cli.app import main as textual_main
    except ImportError:
        # Fallback to legacy CLI if Textual is not installed
        print("Note: Textual not installed. Using legacy CLI.")
        print("Install with: pip install textual>=0.47.0")
        print("")

        from docpixie.cli.legacy import main as legacy_main
        legacy_main()
    else:
        textual_main()
def sanitize_llm_json(response: str) -> str:
    """
    Sanitize JSON response from LLM by removing markdown code blocks and extra whitespace.

    LLMs sometimes wrap JSON responses with markdown code blocks like:
    ```json
    {"key": "value"}
    ```

    and sometimes additionally surround that block with prose
    ("Here is the result: ...").  Both shapes are handled:

    1. If the whole (stripped) response is a single fenced block, the fence
       is removed.
    2. Otherwise, if the response does not already start like a JSON
       document (``{`` or ``[``), the first fenced block found anywhere in
       the text is extracted.

    Text that contains no fenced block is returned stripped but otherwise
    unchanged.

    Args:
        response: Raw response string from LLM

    Returns:
        Sanitized JSON string ready for json.loads()
    """
    flags = re.DOTALL | re.IGNORECASE
    fence_body = r'```(?:json)?\s*\n?(.*?)\n?```'

    # Strip leading/trailing whitespace
    cleaned = response.strip()

    # Case 1: the entire response is one ```json ... ``` / ``` ... ``` block.
    match = re.match(r'^' + fence_body + r'$', cleaned, flags)

    # Case 2: a fenced block embedded in surrounding prose.  Skipped when the
    # text already looks like raw JSON, so backticks inside a JSON string
    # value are never mistaken for a fence.
    if match is None and not cleaned.startswith(('{', '[')):
        match = re.search(fence_body, cleaned, flags)

    if match:
        cleaned = match.group(1).strip()

    return cleaned
| The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /docpixie/exceptions.py: -------------------------------------------------------------------------------- 1 | """ 2 | Custom exceptions for DocPixie RAG Agent 3 | """ 4 | 5 | 6 | class DocPixieError(Exception): 7 | """Base exception for DocPixie errors""" 8 | pass 9 | 10 | 11 | class ContextProcessingError(DocPixieError): 12 | """Error occurred during conversation context processing""" 13 | pass 14 | 15 | 16 | class QueryReformulationError(DocPixieError): 17 | """Error occurred during query reformulation""" 18 | pass 19 | 20 | 21 | class QueryClassificationError(DocPixieError): 22 | """Error occurred during query classification""" 23 | pass 24 | 25 | 26 | class TaskPlanningError(DocPixieError): 27 | """Error occurred during task planning or document selection""" 28 | pass 29 | 30 | 31 | class PageSelectionError(DocPixieError): 32 | """Error occurred during page selection""" 33 | pass 34 | 35 | 36 | class TaskAnalysisError(DocPixieError): 37 | """Error occurred during task analysis""" 38 | pass 39 | 40 | 41 | class ResponseSynthesisError(DocPixieError): 42 | """Error occurred during response synthesis""" 43 | pass 44 | 45 | 46 | class DocumentSelectionError(DocPixieError): 47 | """Error occurred during document selection""" 48 | pass 49 | 50 | 51 | class 
[build-system]
requires = ["setuptools>=61.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "docpixie"
version = "0.1.0"
description = "A lightweight, vision-based document question-answering system"
readme = "README.md"
license = "MIT"
authors = [
    {name = "DocPixie Team"},
]
maintainers = [
    {name = "DocPixie Team"},
]
classifiers = [
    "Development Status :: 3 - Alpha",
    "Intended Audience :: Developers",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12",
    "Topic :: Software Development :: Libraries :: Python Modules",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
]
requires-python = ">=3.8"
dependencies = [
    "Pillow>=10.0.0",
    "PyMuPDF>=1.23.0",
    "openai>=1.0.0",
    "anthropic>=0.10.0",
    "textual>=0.47.0",
    "textual-dev>=1.3.0",
    "pyfiglet>=0.8.0",
]

[project.optional-dependencies]
dev = [
    "pytest>=7.0.0",
    "pytest-asyncio>=0.21.0",
    "build>=1.0.0",
    "twine>=4.0.0",
]

[project.urls]
Homepage = "https://github.com/qnguyen3/docpixie"
Documentation = "https://github.com/qnguyen3/docpixie#readme"
Repository = "https://github.com/qnguyen3/docpixie.git"
Issues = "https://github.com/qnguyen3/docpixie/issues"

[project.scripts]
docpixie = "docpixie.cli:main"

[tool.setuptools]
# An explicit package list is not recursive: every subpackage must be named,
# including docpixie.cli.widgets, which docpixie.cli imports from.
packages = [
    "docpixie",
    "docpixie.ai",
    "docpixie.cli",
    "docpixie.cli.widgets",
    "docpixie.core",
    "docpixie.models",
    "docpixie.processors",
    "docpixie.providers",
    "docpixie.storage",
    "docpixie.utils",
]

[tool.setuptools.package-data]
docpixie = ["py.typed"]
T = TypeVar('T')


def sync_wrapper(coro: Awaitable[T]) -> T:
    """
    Run an awaitable to completion from synchronous code.

    Handles both cases:
    - no event loop is running in this thread: the awaitable is run with
      ``asyncio.run``;
    - an event loop is already running: the awaitable is run on a fresh
      event loop in a helper thread, and this call blocks until it finishes.

    Any exception raised by the awaitable propagates unchanged.
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No running event loop, safe to use asyncio.run
        return asyncio.run(coro)

    # Kept outside the try block on purpose: a RuntimeError raised by the
    # awaitable itself must not be mistaken for "no running loop".
    return _run_in_thread(coro)


def _run_in_thread(coro: Awaitable[T]) -> T:
    """
    Run an awaitable in a separate thread with its own event loop.

    Blocks the calling thread until the worker finishes, then returns the
    awaitable's result or re-raises whatever it raised.
    """
    outcome = {}

    def thread_target():
        new_loop = None
        try:
            # Create new event loop for this thread
            new_loop = asyncio.new_event_loop()
            asyncio.set_event_loop(new_loop)
            outcome["value"] = new_loop.run_until_complete(coro)
        except BaseException as exc:
            # BaseException so that asyncio.CancelledError / SystemExit are
            # reported to the caller instead of silently yielding None.
            outcome["exception"] = exc
        finally:
            # The loop may never have been created if new_event_loop failed.
            if new_loop is not None:
                new_loop.close()

    thread = threading.Thread(target=thread_target)
    thread.start()
    thread.join()

    if "exception" in outcome:
        raise outcome["exception"]

    return outcome["value"]


def ensure_async(func):
    """
    Decorator to ensure a function is async-compatible.

    Coroutine functions are returned unchanged.  A synchronous function is
    wrapped in a coroutine function that runs it in the running loop's
    default executor, so awaiting it does not block the event loop.
    """
    if asyncio.iscoroutinefunction(func):
        return func

    @wraps(func)
    async def async_wrapper(*args, **kwargs):
        # Inside a coroutine there is always a running loop.
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, lambda: func(*args, **kwargs))

    return async_wrapper


def make_sync_version(async_func):
    """
    Create a synchronous version of an async function.

    The returned function calls ``async_func`` with the given arguments and
    runs the resulting coroutine through ``sync_wrapper``.
    """
    @wraps(async_func)
    def sync_version(*args, **kwargs):
        return sync_wrapper(async_func(*args, **kwargs))

    return sync_version
# Provider name -> (config attribute holding its API key, name used in errors).
# Insertion order defines the order returned by get_available_providers().
_PROVIDER_API_KEYS = {
    "openai": ("openai_api_key", "OpenAI"),
    "anthropic": ("anthropic_api_key", "Anthropic"),
    "openrouter": ("openrouter_api_key", "OpenRouter"),
}


# The return annotation is quoted so it is not evaluated at definition time:
# subscripting the builtin ``list`` raises TypeError on Python 3.8.
def get_available_providers() -> "list[str]":
    """Get list of available provider names"""
    return list(_PROVIDER_API_KEYS)


def validate_provider_config(provider: str, config: "DocPixieConfig") -> bool:
    """
    Validate provider configuration

    Checks that the provider name is known, that the API key attribute for
    that provider is set on ``config``, and that ``config.vision_model`` is
    set.

    Args:
        provider: Provider name
        config: Configuration to validate

    Returns:
        True if configuration is valid

    Raises:
        ValueError: If the provider is unknown or the configuration is invalid
    """
    if provider not in get_available_providers():
        raise ValueError(f"Unknown provider: {provider}")

    key_attr, display_name = _PROVIDER_API_KEYS[provider]

    # Only the selected provider's key is inspected.
    if not getattr(config, key_attr):
        raise ValueError(f"{display_name} API key is required")
    if not config.vision_model:
        raise ValueError("Vision model is required")

    return True
class QueryClassifier:
    """
    Classifies queries to determine processing strategy

    Key classification:
    - needs_documents: Whether query requires document retrieval
    """

    def __init__(self, provider: BaseProvider):
        self.provider = provider

    async def classify_query(self, query: str) -> dict:
        """
        Classify a query to determine processing approach

        Sends the classification prompt to the provider, parses the reply as
        JSON and checks that it is an object containing the required keys.

        Args:
            query: The user's query (potentially reformulated)

        Returns:
            Dict with classification results:
            {
                "reasoning": "explanation",
                "needs_documents": bool
            }

        Raises:
            QueryClassificationError: If the provider call fails, the reply
                is not valid JSON, or required fields are missing. The
                original exception, when there is one, is attached as
                ``__cause__``.
        """
        try:
            # Build classification prompt
            prompt = QUERY_CLASSIFICATION_PROMPT.format(query=query)

            messages_for_api = [
                {"role": "system", "content": SYSTEM_QUERY_CLASSIFIER},
                {"role": "user", "content": prompt}
            ]

            response = await self.provider.process_text_messages(
                messages=messages_for_api,
                max_tokens=1024,
                temperature=0.1
            )

            # Parse JSON response
            try:
                result = json.loads(sanitize_llm_json(response))
            except json.JSONDecodeError as e:
                logger.error(f"Failed to parse classification JSON: {response}")
                raise QueryClassificationError(
                    f"Invalid JSON response from classification: {e}"
                ) from e

            # Validate required fields (a non-object payload has none of them)
            if (
                not isinstance(result, dict)
                or "reasoning" not in result
                or "needs_documents" not in result
            ):
                raise QueryClassificationError(
                    f"Missing required fields in classification response: {result}"
                )

            logger.info(f"Query classified: needs_documents={result['needs_documents']}, "
                        f"reasoning='{result['reasoning']}'")

            return result

        except QueryClassificationError as e:
            # Already specific: log and re-raise as-is rather than wrapping
            # it a second time in the generic handler below.
            logger.error(f"Query classification failed: {e}")
            raise
        except Exception as e:
            logger.error(f"Query classification failed: {e}")
            raise QueryClassificationError(f"Failed to classify query: {e}") from e
class BaseProcessor(ABC):
    """Abstract interface that every document processor implements."""

    def __init__(self, config: DocPixieConfig):
        self.config = config

    @abstractmethod
    def supports(self, file_path: str) -> bool:
        """Return whether this processor can handle the given file."""

    @abstractmethod
    async def process(self, file_path: str, document_id: Optional[str] = None) -> Document:
        """
        Turn a document file into a Document made of pages.

        Args:
            file_path: Location of the document file
            document_id: Custom document ID to use, if any

        Returns:
            Document holding the processed pages
        """

    def get_supported_extensions(self) -> List[str]:
        """Return the file extensions this processor handles (none by default)."""
        return []

    def _create_document(
        self,
        file_path: str,
        pages: List[Page],
        document_id: Optional[str] = None
    ) -> Document:
        """Assemble a Document from already-processed pages."""
        source = Path(file_path)
        stem = source.stem

        # A caller-supplied ID wins; otherwise one is derived from the path.
        resolved_id = document_id or self._generate_document_id(file_path)

        # Size is recorded as 0 when the file is not present on disk.
        if source.exists():
            size_in_bytes = source.stat().st_size
        else:
            size_in_bytes = 0

        details = {
            'original_file': file_path,
            'processor': type(self).__name__,
            'file_size': size_in_bytes
        }
        return Document(id=resolved_id, name=stem, pages=pages, metadata=details)

    def _generate_document_id(self, file_path: str) -> str:
        """Derive a document ID as the MD5 hex digest of the path string."""
        import hashlib
        digest = hashlib.md5(file_path.encode())
        return digest.hexdigest()

    def _validate_file(self, file_path: str) -> None:
        """Raise unless the path is an existing, non-empty regular file."""
        candidate = Path(file_path)
        if not candidate.exists():
            raise FileNotFoundError(f"File not found: {file_path}")
        if not candidate.is_file():
            raise ValueError(f"Path is not a file: {file_path}")
        if not candidate.stat().st_size:
            raise ValueError(f"File is empty: {file_path}")


class ProcessingError(Exception):
    """Raised when a document cannot be processed."""

    def __init__(self, message: str, file_path: str, page_number: Optional[int] = None):
        super().__init__(message)
        # File being processed and, when known, the page that failed.
        self.file_path = file_path
        self.page_number = page_number
logging 7 | 8 | from ..providers.base import BaseProvider 9 | from ..exceptions import QueryReformulationError 10 | from ..core.utils import sanitize_llm_json 11 | from .prompts import QUERY_REFORMULATION_PROMPT, SYSTEM_QUERY_REFORMULATOR 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class QueryReformulator: 17 | """ 18 | Reformulates queries by resolving references for better search 19 | 20 | Focuses on: 21 | - Resolving pronouns and references (e.g., "it", "this", "that") 22 | - Keeping queries concise and focused on current intent 23 | - NOT combining multiple questions or intents 24 | - Maintaining optimal length for search 25 | """ 26 | 27 | def __init__(self, provider: BaseProvider): 28 | self.provider = provider 29 | 30 | async def reformulate_with_context( 31 | self, 32 | current_query: str, 33 | conversation_context: str 34 | ) -> str: 35 | """ 36 | Reformulate query by resolving references while keeping it concise 37 | 38 | Args: 39 | current_query: The current user query 40 | conversation_context: Processed context from ContextProcessor 41 | 42 | Returns: 43 | Reformulated query with resolved references 44 | 45 | Raises: 46 | QueryReformulationError: If reformulation fails 47 | """ 48 | try: 49 | # Build prompt using existing template 50 | prompt = QUERY_REFORMULATION_PROMPT.format( 51 | conversation_context=conversation_context, 52 | recent_topics="", # Let AI extract topics from context 53 | current_query=current_query 54 | ) 55 | 56 | messages_for_api = [ 57 | {"role": "system", "content": SYSTEM_QUERY_REFORMULATOR}, 58 | {"role": "user", "content": prompt} 59 | ] 60 | 61 | response = await self.provider.process_text_messages( 62 | messages=messages_for_api, 63 | max_tokens=8192, 64 | temperature=0.2 65 | ) 66 | 67 | # Parse JSON response 68 | result = None 69 | try: 70 | result = json.loads(sanitize_llm_json(response)) 71 | reformulated = result.get("reformulated_query", current_query) 72 | 73 | logger.info(f"Query reformulation: 
@dataclass
class APIResult:
    """Container for API response with optional cost tracking"""
    text: str
    cost: Optional[float] = None


class BaseProvider(ABC):
    """
    Base class for AI vision providers

    Concrete providers implement the two ``process_*`` coroutines. This class
    stores the config, exposes the cost counters and offers the image helpers
    shared by all providers.
    """

    def __init__(self, config: DocPixieConfig):
        self.config = config
        # NOTE(review): nothing in this base class updates the counters;
        # presumably subclasses do so after each API call -- confirm.
        self.last_api_cost: Optional[float] = None
        self.total_cost: float = 0.0

    @abstractmethod
    async def process_text_messages(
        self,
        messages: List[dict],
        max_tokens: int = 512,
        temperature: float = 0.3
    ) -> str:
        """Process text-only messages through the provider API"""
        pass

    @abstractmethod
    async def process_multimodal_messages(
        self,
        messages: List[dict],
        max_tokens: int = 300,
        temperature: float = 0.3
    ) -> str:
        """Process messages with text and images through the provider API"""
        pass

    def get_last_cost(self) -> Optional[float]:
        """Get the cost of the last API call (if available)"""
        return self.last_api_cost

    def get_total_cost(self) -> float:
        """Get the total accumulated cost"""
        return self.total_cost

    def reset_cost_tracking(self) -> None:
        """Reset cost tracking"""
        self.last_api_cost = None
        self.total_cost = 0.0

    # Helper methods for image handling (shared by all providers)

    def _encode_image(self, image_path: str) -> str:
        """
        Encode image to base64 for API calls

        Args:
            image_path: Path of the image file to read

        Returns:
            The file's bytes as a base64 string

        Raises:
            Exception: Whatever reading the file raised; it is logged and
                re-raised unchanged
        """
        try:
            with open(image_path, 'rb') as image_file:
                encoded_string = base64.b64encode(image_file.read()).decode('utf-8')
                return encoded_string
        except Exception as e:
            logger.error(f"Failed to encode image {image_path}: {e}")
            raise

    def _create_image_data_url(self, image_path: str) -> str:
        """
        Create data URL for image

        The MIME type is guessed from the file extension. Anything that does
        not resolve to an ``image/*`` type falls back to ``image/jpeg``, the
        value this method previously used for every file.

        Args:
            image_path: Path of the image file to embed

        Returns:
            A ``data:<mime>;base64,<payload>`` URL
        """
        # Local import keeps this block self-contained.
        import mimetypes

        mime_type, _ = mimetypes.guess_type(image_path)
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/jpeg"

        encoded_image = self._encode_image(image_path)
        return f"data:{mime_type};base64,{encoded_image}"

    def _validate_image_path(self, image_path: str) -> bool:
        """Validate image path exists and is a regular file"""
        path = Path(image_path)
        return path.exists() and path.is_file()


class ProviderError(Exception):
    """Exception raised by provider operations"""

    def __init__(self, message: str, provider: str, image_path: Optional[str] = None):
        self.provider = provider
        self.image_path = image_path
        super().__init__(message)
app: 'DocPixieTUI', state_manager: AppStateManager): 17 | self.app = app 18 | self.state_manager = state_manager 19 | 20 | def display_task_update(self, event_type: str, data: Any) -> None: 21 | """Display task plan updates""" 22 | chat_log = self.app.query_one("#chat-log", ChatArea) 23 | 24 | if event_type == 'plan_created': 25 | plan = data 26 | self.state_manager.current_plan = plan 27 | self.state_manager.completed_tasks.clear() 28 | chat_log.hide_processing_status(mark_done=True, final_text="Planning") 29 | chat_log.show_plan(plan) 30 | 31 | elif event_type == 'plan_updated': 32 | plan = data 33 | self.state_manager.current_plan = plan 34 | chat_log.show_plan(plan, is_update=True, completed_tasks=list(self.state_manager.completed_tasks)) 35 | 36 | elif event_type == 'task_started': 37 | task = data['task'] 38 | task_name = task.name if hasattr(task, 'name') else str(task) 39 | 40 | doc_name = self._get_document_name_for_task(task) 41 | chat_log.show_task_progress(task_name, None, doc_name) 42 | 43 | elif event_type == 'pages_selected': 44 | task = data['task'] 45 | page_numbers = data.get('page_numbers', []) 46 | task_name = task.name if hasattr(task, 'name') else str(task) 47 | 48 | doc_name = self._get_document_name_for_task(task) 49 | pages_count = len(page_numbers) if isinstance(page_numbers, (list, tuple)) else 0 50 | chat_log.show_task_progress(task_name, pages_count, doc_name) 51 | 52 | elif event_type == 'task_completed': 53 | task = data['task'] 54 | task_name = task.name if hasattr(task, 'name') else str(task) 55 | 56 | chat_log.update_task_status(task_name, done=True) 57 | self.state_manager.completed_tasks.add(task_name) 58 | 59 | if self.state_manager.current_plan: 60 | chat_log.show_plan( 61 | self.state_manager.current_plan, 62 | is_update=True, 63 | completed_tasks=list(self.state_manager.completed_tasks) 64 | ) 65 | 66 | def _get_document_name_for_task(self, task) -> str: 67 | """Extract document name from task, with fallback to 'document'""" 
class ProcessorFactory:
    """Factory for creating document processors"""

    def __init__(self, config: DocPixieConfig):
        self.config = config
        self._processors: Dict[str, Type[BaseProcessor]] = {
            'pdf': PDFProcessor,
            'image': ImageProcessor
        }

        # Map file extensions to processor types
        self._extension_map: Dict[str, str] = {}
        self._build_extension_map()

    def _map_extensions(self, processor_type: str, processor_class: Type[BaseProcessor]) -> None:
        """Point every extension supported by processor_class at processor_type"""
        # A throwaway instance is needed because the supported extensions
        # are reported by an instance method.
        processor = processor_class(self.config)
        for ext in processor.get_supported_extensions():
            self._extension_map[ext.lower()] = processor_type

    def _build_extension_map(self) -> None:
        """Build mapping from file extensions to processor types"""
        for processor_type, processor_class in self._processors.items():
            self._map_extensions(processor_type, processor_class)

        logger.debug(f"Built extension map: {self._extension_map}")

    def get_processor(self, file_path: str) -> BaseProcessor:
        """
        Get appropriate processor for file

        Args:
            file_path: Path to file

        Returns:
            Processor instance

        Raises:
            ValueError: If the file has no extension or its type is not supported
        """
        file_extension = Path(file_path).suffix.lower()

        if not file_extension:
            raise ValueError(f"File has no extension: {file_path}")

        processor_type = self._extension_map.get(file_extension)

        if not processor_type:
            supported_exts = list(self._extension_map.keys())
            raise ValueError(
                f"Unsupported file type '{file_extension}'. "
                f"Supported extensions: {supported_exts}"
            )

        processor_class = self._processors[processor_type]
        processor = processor_class(self.config)

        logger.debug(f"Selected {processor_class.__name__} for {file_path}")
        return processor

    def supports_file(self, file_path: str) -> bool:
        """Check if file type is supported"""
        file_extension = Path(file_path).suffix.lower()
        return file_extension in self._extension_map

    def get_supported_extensions(self) -> Dict[str, str]:
        """Get all supported extensions and their processor types (as a copy)"""
        return self._extension_map.copy()

    def register_processor(self, processor_type: str, processor_class: Type[BaseProcessor]):
        """
        Register a custom processor

        Extensions already mapped to another processor type are re-pointed at
        this one.

        Args:
            processor_type: Unique identifier for processor
            processor_class: Processor class
        """
        self._processors[processor_type] = processor_class

        # Update extension mapping
        self._map_extensions(processor_type, processor_class)

        logger.info(f"Registered custom processor: {processor_type}")

    def list_processors(self) -> Dict[str, Type[BaseProcessor]]:
        """Get all registered processors (as a copy)"""
        return self._processors.copy()

    def create_processor(self, processor_type: str) -> Optional[BaseProcessor]:
        """
        Create processor by type

        Args:
            processor_type: Type of processor to create

        Returns:
            Processor instance or None if type not found
        """
        processor_class = self._processors.get(processor_type)
        if processor_class:
            return processor_class(self.config)
        return None
'document_summary_generated': document_summary is not None, 55 | 'summary_model': self.config.provider 56 | }, 57 | created_at=document.created_at 58 | ) 59 | 60 | logger.info(f"Completed document summarization: {document.name}") 61 | return updated_document 62 | 63 | async def _generate_document_summary(self, pages: List[Page], document_name: str) -> Optional[str]: 64 | """Generate overall document summary using all page images in a single vision call""" 65 | try: 66 | # Get all page image paths 67 | image_paths = [page.image_path for page in pages if page.image_path] 68 | 69 | if not image_paths: 70 | logger.warning("No page images available for document summary") 71 | return None 72 | 73 | # Build messages for document summary 74 | messages = [ 75 | { 76 | "role": "system", 77 | "content": "You are a document analysis expert. Analyze all pages of this document and create a comprehensive summary that captures the overall content, main themes, key information, and purpose of the entire document. Consider how all pages work together to form a complete document." 78 | }, 79 | { 80 | "role": "user", 81 | "content": [ 82 | { 83 | "type": "text", 84 | "text": f"Please analyze this complete document titled '{document_name}' and provide a comprehensive summary. Look at all pages together to understand the document's overall structure, main themes, key information, and purpose." 
class TaskStatus(str, Enum):
    """Agent task status"""
    PENDING = "pending"
    IN_PROGRESS = "in_progress"
    COMPLETED = "completed"
    CANCELLED = "cancelled"


# Roles accepted by ConversationMessage; kept next to the error message so
# the two cannot drift apart again.
_VALID_ROLES = ("system", "user", "assistant")


@dataclass
class ConversationMessage:
    """Represents a single conversation message"""
    role: str  # "system", "user" or "assistant"
    content: str
    timestamp: datetime = field(default_factory=datetime.now)
    cost: float = 0.0  # Cost for this message (agent pipeline total for assistant messages)

    def __post_init__(self):
        """
        Validate message data

        Raises:
            ValueError: If the role is not an accepted one, or the content is
                empty or whitespace only
        """
        if self.role not in _VALID_ROLES:
            # The check always accepted "system"; the message now says so.
            raise ValueError("Role must be 'system', 'user' or 'assistant'")
        if not self.content.strip():
            raise ValueError("Content cannot be empty")


@dataclass
class AgentTask:
    """Represents a single task in the agent's plan"""
    id: str = field(default_factory=lambda: str(uuid.uuid4()))
    name: str = ""
    description: str = ""
    status: TaskStatus = TaskStatus.PENDING
    document: str = ""  # Single document ID assigned to this task

    def __post_init__(self):
        """
        Validate task data

        Raises:
            ValueError: If name or description is empty or whitespace only.
                Both default to "", so callers must always supply them.
        """
        if not self.name.strip():
            raise ValueError("Task name cannot be empty")
        if not self.description.strip():
            raise ValueError("Task description cannot be empty")


@dataclass
class TaskPlan:
    """Represents the agent's current task plan"""
    initial_query: str
    tasks: List[AgentTask] = field(default_factory=list)
    current_iteration: int = 0

    def get_next_pending_task(self) -> Optional[AgentTask]:
        """Get the first task in list order that is still pending, or None"""
        return next((task for task in self.tasks if task.status == TaskStatus.PENDING), None)

    def has_pending_tasks(self) -> bool:
        """Check if there are any pending tasks"""
        return any(task.status == TaskStatus.PENDING for task in self.tasks)

    def mark_task_completed(self, task_id: str) -> bool:
        """Mark a task as completed; return False if no task has that id"""
        task = next((t for t in self.tasks if t.id == task_id), None)
        if task:
            task.status = TaskStatus.COMPLETED
            return True
        return False

    def add_task(self, task: AgentTask):
        """Add a new task to the end of the plan"""
        self.tasks.append(task)

    def remove_task(self, task_id: str) -> bool:
        """Remove a task from the plan; return True if something was removed"""
        original_length = len(self.tasks)
        self.tasks = [t for t in self.tasks if t.id != task_id]
        return len(self.tasks) < original_length

    def get_completed_tasks(self) -> List[AgentTask]:
        """Get all completed tasks"""
        return [task for task in self.tasks if task.status == TaskStatus.COMPLETED]


@dataclass
class TaskResult:
    """Represents the result of executing a single task"""
    task: AgentTask
    selected_pages: List[Page]
    analysis: str
    pages_analyzed: int = 0

    def __post_init__(self):
        """Calculate pages analyzed (overrides any value passed in)"""
        self.pages_analyzed = len(self.selected_pages)


@dataclass
class AgentQueryResult:
    """Represents the final result of processing a user query through the agent pipeline"""
    query: str
    answer: str
    selected_pages: List[Page]
    task_results: List[TaskResult] = field(default_factory=list)
    total_iterations: int = 0
    processing_time_seconds: float = 0.0
    total_cost: float = 0.0  # Total cost of all API calls for this query

    def get_unique_pages(self) -> List[Page]:
        """
        Get selected_pages de-duplicated by image path

        Only ``self.selected_pages`` is read (not ``task_results``). The first
        page seen for each ``image_path`` is kept and order is preserved.
        """
        seen_paths = set()
        unique_pages = []

        for page in self.selected_pages:
            if page.image_path not in seen_paths:
                seen_paths.add(page.image_path)
                unique_pages.append(page)

        return unique_pages

    def get_total_pages_analyzed(self) -> int:
        """Get total number of pages analyzed across all tasks"""
        return sum(result.pages_analyzed for result in self.task_results)
class CommandHandler:
    """Routes slash commands typed in the CLI to their handlers"""

    def __init__(self, app: 'DocPixieTUI', state_manager: AppStateManager):
        self.app = app
        self.state_manager = state_manager

    async def handle_command(self, command: str) -> None:
        """Run the handler for command, or report it as unknown in the chat log"""
        chat_log = self.app.query_one("#chat-log", ChatArea)

        if command == "/exit":
            self.state_manager.save_current_conversation()
            self.app.exit()
            return

        if command == "/new":
            await self._handle_new_command(chat_log)
            return

        # Plain functions that take the chat log.
        log_handlers = {
            "/clear": self._handle_clear_command,
            "/save": self._handle_save_command,
            "/help": self._handle_help_command,
        }
        if command in log_handlers:
            log_handlers[command](chat_log)
            return

        # Coroutines that open a dialog and take no arguments.
        dialog_handlers = {
            "/conversations": self._handle_conversations_command,
            "/model": self._handle_model_command,
            "/documents": self._handle_documents_command,
        }
        if command in dialog_handlers:
            await dialog_handlers[command]()
            return

        chat_log.write(f"[warning]Unknown command: {command}[/warning]\n")
        chat_log.write("Type /help for available commands\n\n")

    async def _handle_new_command(self, chat_log: ChatArea) -> None:
        """Handle /new: save the current conversation, start a fresh one and reset the display"""
        state = self.state_manager
        state.save_current_conversation()
        state.create_new_conversation()
        state.clear_task_plan()

        chat_log.clear()
        self.app.show_welcome_message()
        chat_log.write("[green bold]●[/green bold] Started new conversation\n\n")

        self.app.query_one("#status-label").update(state.get_status_text())

    def _handle_clear_command(self, chat_log: ChatArea) -> None:
        """Handle /clear: drop the task plan and redraw the welcome message"""
        self.state_manager.clear_task_plan()
        chat_log.clear()
        self.app.show_welcome_message()

    def _handle_save_command(self, chat_log: ChatArea) -> None:
        """Handle /save: persist the conversation if one exists and has history"""
        state = self.state_manager
        if not (state.current_conversation_id and state.conversation_history):
            chat_log.write("[warning]No conversation to save[/warning]\n\n")
            return

        state.save_current_conversation()
        chat_log.write("[green bold]●[/green bold] Conversation saved!\n\n")

    async def _handle_conversations_command(self) -> None:
        """Handle /conversations: open the conversation manager dialog"""
        dialog = ConversationManagerDialog(self.state_manager.current_conversation_id)
        await self.app.push_screen(dialog)

    async def _handle_model_command(self) -> None:
        """Handle /model: open the model selector dialog"""
        dialog = ModelSelectorDialog()
        await self.app.push_screen(dialog)

    async def _handle_documents_command(self) -> None:
        """Handle /documents: open the document manager dialog"""
        dialog = DocumentManagerDialog(
            self.state_manager.documents_folder,
            self.app.docpixie
        )
        await self.app.push_screen(dialog)

    def _handle_help_command(self, chat_log: ChatArea) -> None:
        """Handle /help: write the command reference to the chat log, one write per line"""
        help_lines = (
            "\n[bold]Available Commands:[/bold]\n",
            " /new - Start a new conversation (Ctrl+N)\n",
            " /conversations - Switch between conversations (Ctrl+L)\n",
            " /save - Save current conversation\n",
            " /clear - Clear the chat display\n",
            " /model - Configure AI models (Ctrl+O)\n",
            " /documents - Manage and index documents (Ctrl+D)\n",
            " /help - Show this help message\n",
            " /exit - Exit the program (Ctrl+Q)\n\n",
            "[dim]Press Ctrl+/ to open command palette[/dim]\n\n",
        )
        for line in help_lines:
            chat_log.write(line)
class OpenAIProvider(BaseProvider):
    """OpenAI GPT-4V provider for raw API operations"""

    def __init__(self, config: DocPixieConfig):
        """
        Create the async OpenAI client

        Raises:
            ValueError: If the config carries no OpenAI API key
            ImportError: If the ``openai`` package is not installed
        """
        super().__init__(config)

        if not config.openai_api_key:
            raise ValueError("OpenAI API key is required")

        # Import here to make it optional dependency. Only the import is
        # guarded, so a failure while constructing the client is not
        # misreported as a missing package.
        try:
            from openai import AsyncOpenAI
        except ImportError as e:
            raise ImportError("OpenAI library not found. Install with: pip install openai") from e

        self.client = AsyncOpenAI(api_key=config.openai_api_key)
        self.model = config.vision_model

    @staticmethod
    def _extract_text(response: Any) -> str:
        """
        Return the stripped text of the first choice in response

        Raises:
            ValueError: If the first choice carries no text content
        """
        content = response.choices[0].message.content
        if content is None:
            # Without this check the failure surfaced as an opaque
            # "'NoneType' object has no attribute 'strip'".
            raise ValueError("response contained no text content")
        return content.strip()

    async def process_text_messages(
        self,
        messages: List[Dict[str, Any]],
        max_tokens: int = 300,
        temperature: float = 0.3
    ) -> str:
        """
        Process text-only messages through OpenAI API

        Raises:
            ProviderError: If the request fails or the response has no text
        """
        try:
            # NOTE(review): this reads config.model while the multimodal path
            # uses self.model (config.vision_model) -- confirm DocPixieConfig
            # really exposes a separate text-model attribute named `model`.
            response = await self.client.chat.completions.create(
                model=self.config.model,
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature
            )

            result = self._extract_text(response)
            logger.debug(f"OpenAI text response: {result[:50]}...")

            return result

        except Exception as e:
            logger.error(f"OpenAI text processing failed: {e}")
            raise ProviderError(f"Text processing failed: {e}", "openai") from e

    async def process_multimodal_messages(
        self,
        messages: List[Dict[str, Any]],
        max_tokens: int = 300,
        temperature: float = 0.3
    ) -> str:
        """
        Process multimodal messages (text + images) through OpenAI Vision API

        Raises:
            ProviderError: If message preparation or the request fails, or
                the response has no text
        """
        try:
            # Process messages to convert image paths to data URLs
            processed_messages = self._prepare_openai_messages(messages)

            response = await self.client.chat.completions.create(
                model=self.model,  # Use vision model
                messages=processed_messages,
                max_tokens=max_tokens,
                temperature=temperature
            )

            result = self._extract_text(response)
            logger.debug(f"OpenAI multimodal response: {result[:50]}...")

            return result

        except Exception as e:
            logger.error(f"OpenAI multimodal processing failed: {e}")
            raise ProviderError(f"Multimodal processing failed: {e}", "openai") from e

    def _prepare_openai_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Prepare messages for OpenAI API by converting image paths to data URLs

        Only user messages whose content is a list are rewritten: each
        ``image_path`` item becomes an ``image_url`` item, or is dropped with
        a warning when the path does not validate. Every other message is
        passed through unchanged.
        """
        processed_messages = []

        for message in messages:
            if message["role"] == "system":
                # System messages are text-only
                processed_messages.append(message)
            elif message["role"] == "user" and isinstance(message["content"], list):
                # User message with multimodal content
                processed_content = []

                for content_item in message["content"]:
                    if content_item["type"] == "text":
                        processed_content.append(content_item)
                    elif content_item["type"] == "image_path":
                        # Convert image path to OpenAI format
                        image_path = content_item["image_path"]
                        if self._validate_image_path(image_path):
                            image_data_url = self._create_image_data_url(image_path)
                            processed_content.append({
                                "type": "image_url",
                                "image_url": {
                                    "url": image_data_url,
                                    "detail": content_item.get("detail", "high")
                                }
                            })
                        else:
                            logger.warning(f"Skipping invalid image path: {image_path}")
                    else:
                        # Pass through other content types
                        processed_content.append(content_item)

                processed_messages.append({
                    "role": message["role"],
                    "content": processed_content
                })
            else:
                # Regular text message
                processed_messages.append(message)

        return processed_messages
PLANNING_MODELS = [
    "anthropic/claude-opus-4.1",
    "anthropic/claude-sonnet-4",
    "anthropic/claude-3.5-haiku",
    "google/gemini-2.5-flash",
    "google/gemini-2.5-pro",
    "openai/gpt-5",
    "openai/gpt-5-mini",
    "openai/gpt-4.1",
    "openai/gpt-4.1-mini",
    "qwen/qwen-max",
    "qwen/qwen-plus",
    "nousresearch/hermes-4-70b",
    "deepseek/deepseek-chat-v3.1",
    "mistralai/mistral-medium-3.1",
]

VISION_MODELS = [
    "google/gemini-2.5-pro",
    "google/gemini-2.5-flash",
    "google/gemini-2.5-flash-lite",
    "openai/gpt-4.1",
    "openai/gpt-4.1-mini",
    "openai/gpt-4.1-nano",
    "anthropic/claude-sonnet-4",
]


@dataclass
class CLIConfig:
    """CLI configuration stored globally in ~/.docpixie/"""

    openrouter_api_key: Optional[str] = None

    text_model: str = "qwen/qwen-plus"
    vision_model: str = "google/gemini-2.5-flash"

    last_conversation_id: Optional[str] = None
    theme: str = "default"

    auto_index_on_startup: bool = True
    max_conversation_history: int = 20

    def to_dict(self) -> Dict[str, Any]:
        """Convert config to dictionary for JSON serialization"""
        return asdict(self)

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'CLIConfig':
        """
        Create config from dictionary

        Keys that are not CLIConfig fields are ignored, so a config file that
        contains an unknown setting no longer invalidates every other setting
        in it. Missing keys keep their defaults.

        Raises:
            TypeError: If data is not a mapping
        """
        # Local import: `fields` is not among the file's top-level imports.
        from dataclasses import fields

        try:
            items = data.items()
        except AttributeError:
            raise TypeError(
                f"config data must be a mapping, got {type(data).__name__}"
            ) from None

        known = {f.name for f in fields(cls)}
        return cls(**{key: value for key, value in items if key in known})


class ConfigManager:
    """Manages global DocPixie CLI configuration"""

    def __init__(self):
        """Initialize config manager with global config directory"""
        self.config_dir = Path.home() / ".docpixie"
        self.config_file = self.config_dir / "config.json"
        self.conversations_dir = self.config_dir / "conversations"

        self.config_dir.mkdir(exist_ok=True)
        self.conversations_dir.mkdir(exist_ok=True)

        self.config = self.load_config()

    def load_config(self) -> CLIConfig:
        """
        Load configuration from file or create default

        When no config file exists, the default config is seeded with
        OPENROUTER_API_KEY from the environment if it is set. When the file
        exists but cannot be read or parsed, a warning is printed and the
        default config is returned.
        """
        if not self.config_file.exists():
            config = CLIConfig()
            env_key = os.getenv("OPENROUTER_API_KEY")
            if env_key:
                config.openrouter_api_key = env_key
            return config

        try:
            with open(self.config_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            return CLIConfig.from_dict(data)
        except Exception as e:
            # Deliberate best-effort: a broken config must not stop the CLI.
            print(f"Warning: Failed to load config: {e}")
            return CLIConfig()

    def save_config(self):
        """Save current configuration to file (errors are printed, not raised)"""
        try:
            with open(self.config_file, 'w', encoding='utf-8') as f:
                json.dump(self.config.to_dict(), f, indent=2)
        except Exception as e:
            print(f"Error saving config: {e}")

    def get_api_key(self) -> Optional[str]:
        """Get OpenRouter API key from config, falling back to the environment"""
        if self.config.openrouter_api_key:
            return self.config.openrouter_api_key
        return os.getenv("OPENROUTER_API_KEY")

    def set_api_key(self, api_key: str):
        """Set and save OpenRouter API key"""
        self.config.openrouter_api_key = api_key
        self.save_config()

    def has_api_key(self) -> bool:
        """Check if API key is configured"""
        return bool(self.get_api_key())

    def get_models(self) -> tuple[str, str]:
        """Get configured models (text, vision)"""
        return self.config.text_model, self.config.vision_model

    def set_models(self, text_model: Optional[str] = None, vision_model: Optional[str] = None):
        """Update model configuration; falsy arguments leave that model unchanged"""
        if text_model:
            self.config.text_model = text_model
        if vision_model:
            self.config.vision_model = vision_model
        self.save_config()

    def get_conversation_path(self, conversation_id: str) -> Path:
        """Get path for a specific conversation file"""
        return self.conversations_dir / f"{conversation_id}.json"

    def get_all_conversations(self) -> list[Path]:
        """Get all conversation files"""
        return list(self.conversations_dir.glob("*.json"))

    def validate_api_key(self, api_key: str) -> bool:
        """
        Check that an API key looks plausible

        This is a local sanity check only: no request is made. A key passes
        when it is non-empty and longer than 10 characters.

        Returns:
            True if the key passes the length check, False otherwise
        """
        try:
            return bool(api_key) and len(api_key) > 10
        except TypeError:
            # Truthy value without a length (e.g. a number) is not a key.
            return False


_config_manager = None


def get_config_manager() -> ConfigManager:
    """Get or create the global config manager instance"""
    global _config_manager
    if _config_manager is None:
        _config_manager = ConfigManager()
    return _config_manager
@dataclass
class Document:
    """A processed document together with its pages"""
    id: str
    name: str
    pages: List[Page]
    summary: Optional[str] = None
    status: DocumentStatus = DocumentStatus.PENDING
    metadata: Dict[str, Any] = field(default_factory=dict)
    created_at: datetime = field(default_factory=datetime.now)

    def __post_init__(self):
        """Fill in a missing ID, then validate name and pages"""
        # A falsy ID (empty string or None) means "generate one"
        self.id = self.id or str(uuid.uuid4())

        if not self.name:
            raise ValueError("Document name is required")

        pages_is_list = isinstance(self.pages, list)
        if not pages_is_list:
            raise ValueError("Pages must be a list")

    @property
    def page_count(self) -> int:
        """Total number of pages in this document"""
        return len(self.pages)

    def get_page(self, page_number: int) -> Optional[Page]:
        """Return the first page whose page_number matches, or None"""
        matches = (candidate for candidate in self.pages
                   if candidate.page_number == page_number)
        return next(matches, None)

    def get_pages_range(self, start: int, end: int) -> List[Page]:
        """Return pages whose page_number lies in [start, end], in stored order"""
        in_range = []
        for candidate in self.pages:
            if candidate.page_number < start or candidate.page_number > end:
                continue
            in_range.append(candidate)
        return in_range
@dataclass
class DocumentProcessRequest:
    """Request to process a document located at file_path"""
    file_path: str
    document_id: Optional[str] = None
    document_name: Optional[str] = None

    def __post_init__(self):
        """
        Validate the path and fill in defaults

        Raises:
            FileNotFoundError: If file_path is empty or does not exist
        """
        source = Path(self.file_path) if self.file_path else None
        if source is None or not source.exists():
            raise FileNotFoundError(f"File not found: {self.file_path}")

        # Default the name to the file name without its final suffix
        self.document_name = self.document_name or source.stem
        # Default the ID to a fresh random UUID
        self.document_id = self.document_id or str(uuid.uuid4())
class VisionPageSelector:
    """
    Selects relevant document pages using vision model analysis
    Key feature: Analyzes actual page IMAGES, not text summaries
    """

    def __init__(self, provider: BaseProvider, config: DocPixieConfig):
        """
        Args:
            provider: Provider whose process_multimodal_messages performs the selection
            config: Configuration; vision_detail is read when attaching page images
        """
        self.provider = provider
        self.config = config

    async def select_pages_for_task(
        self,
        query: str,
        query_description: str,
        task_pages: List[Page]
    ) -> List[Page]:
        """
        Select most relevant pages by analyzing page IMAGES with vision model

        Args:
            query: The question/task to find pages for
            query_description: Description of the query, inserted into the
                selection prompt alongside the query itself
            task_pages: Pages from the task's assigned document

        Returns:
            List of selected pages in the order the model listed them;
            an empty list when task_pages is empty

        Raises:
            PageSelectionError: If the provider call fails, the response cannot
                be parsed, or no valid page was selected
        """
        if not task_pages:
            logger.warning("No pages provided for selection")
            return []

        try:
            logger.info(f"Selecting most relevant pages from {len(task_pages)} task pages")

            # Build vision-based selection message
            messages = self._build_vision_selection_messages(query, query_description, task_pages)

            # Use vision model to analyze page images and select best ones
            result = await self.provider.process_multimodal_messages(
                messages=messages,
                max_tokens=200,
                temperature=0.1  # Low temperature for consistent selection
            )

            # Parse selection result
            selected_pages = self._parse_page_selection(result, task_pages)

            logger.info(f"Successfully selected {len(selected_pages)} pages")
            return selected_pages

        except PageSelectionError:
            # Already logged and carries a precise message; do not wrap it again
            raise
        except Exception as e:
            logger.error(f"Vision page selection failed: {e}")
            raise PageSelectionError(f"Failed to select pages for task: {e}") from e

    def _build_vision_selection_messages(
        self,
        query: str,
        query_description: str,
        all_pages: List[Page]
    ) -> List[Dict[str, Any]]:
        """
        Build multimodal message with all page images for vision analysis
        This is the key method that makes our system vision-first

        Returns:
            A system message followed by one user message whose content holds,
            for every page, an image_path item and a "[Page N]" label
            (N is the 1-based position in all_pages), then the selection prompt
        """
        messages = [
            {
                "role": "system",
                "content": SYSTEM_PAGE_SELECTOR
            }
        ]

        user_content = []
        # Add ALL page images to the message for vision analysis.
        # The label uses the list position, not page.page_number, because
        # _parse_page_selection maps the model's answer back by position.
        for i, page in enumerate(all_pages, 1):
            user_content.extend([
                {
                    "type": "image_path",
                    "image_path": page.image_path,
                    "detail": self.config.vision_detail
                },
                {
                    "type": "text",
                    "text": f"[Page {i}]"
                }
            ])

        user_content.append(
            {
                "type": "text",
                "text": VISION_PAGE_SELECTION_PROMPT.format(query=query, query_description=query_description)
            }
        )

        messages.append(
            {
                "role": "user",
                "content": user_content
            }
        )

        return messages

    def _parse_page_selection(
        self,
        result: str,
        all_pages: List[Page]
    ) -> List[Page]:
        """
        Parse the vision model's page selection response

        Expects a JSON object with a "selected_pages" list of 1-based positions
        into all_pages. Out-of-range, non-integer and repeated entries are ignored.

        Raises:
            PageSelectionError: If the response is not the expected JSON shape
                or contains no valid page position
        """
        try:
            # Parse JSON response
            selection_data = json.loads(sanitize_llm_json(result))
            if not isinstance(selection_data, dict):
                raise TypeError(
                    f"expected a JSON object, got {type(selection_data).__name__}"
                )
            selected_indices = selection_data.get("selected_pages", [])
            if not isinstance(selected_indices, list):
                raise TypeError(
                    f"'selected_pages' must be a list, got {type(selected_indices).__name__}"
                )
        except (json.JSONDecodeError, KeyError, TypeError, AttributeError) as e:
            logger.error(f"Failed to parse page selection JSON: {e}")
            logger.debug(f"Raw vision model response: {result}")

            raise PageSelectionError(f"Failed to parse vision model page selection response: {e}, raw response: \n{result}") from e

        selected_pages = []
        seen_indices = set()
        for idx in selected_indices:
            # bool is a subclass of int; True must not be read as "page 1"
            if isinstance(idx, bool) or not isinstance(idx, int):
                continue
            if not 1 <= idx <= len(all_pages):
                continue
            # Keep the first occurrence only so a page is never returned twice
            if idx in seen_indices:
                continue
            seen_indices.add(idx)

            page = all_pages[idx - 1]
            selected_pages.append(page)
            logger.debug(f"Selected page {idx}: {page.image_path}")

        # An empty selection is treated as a failure rather than a valid answer
        if not selected_pages:
            logger.error("No valid pages selected by vision model")
            raise PageSelectionError("Vision model failed to select any valid pages")

        return selected_pages
4 | 5 | ![DocPixie Demo](screenshot.png) 6 | 7 | ## 🌟 Features 8 | 9 | - **Vision-First Approach**: Documents processed as images using PyMuPDF, preserving visual information and formatting 10 | - **No Vector Database Required**: Eliminates the complexity of embeddings and vector storage 11 | - **Adaptive RAG Agent**: Single intelligent agent that dynamically plans tasks and selects relevant pages 12 | - **Multi-Provider Support**: Works with OpenAI GPT-4V, Anthropic Claude, and OpenRouter 13 | - **Modern CLI Interface**: Beautiful terminal UI built with Textual 14 | - **Conversation Aware**: Maintains context across multiple queries 15 | - **Pluggable Storage**: Local filesystem or in-memory storage backends 16 | 17 | ## 🚀 Quick Start 18 | 19 | ### Installation 20 | 21 | ```bash 22 | # use uv (recommended) 23 | uv pip install docpixie 24 | 25 | # or pip 26 | pip install docpixie 27 | ``` 28 | 29 | Try the CLI: 30 | ```bash 31 | docpixie 32 | ``` 33 | 34 | ### Basic Usage 35 | 36 | ```python 37 | import asyncio 38 | from docpixie import DocPixie 39 | 40 | async def main(): 41 | # Initialize with your API key 42 | docpixie = DocPixie() 43 | 44 | # Add a document 45 | document = await docpixie.add_document("path/to/your/document.pdf") 46 | print(f"Added document: {document.name}") 47 | 48 | # Query the document 49 | result = await docpixie.query("What are the key findings?") 50 | print(f"Answer: {result.answer}") 51 | print(f"Pages used: {result.page_numbers}") 52 | 53 | # Run the example 54 | asyncio.run(main()) 55 | ``` 56 | 57 | ### Using the CLI 58 | 59 | Start the interactive terminal interface: 60 | 61 | ```bash 62 | docpixie 63 | ``` 64 | 65 | The CLI provides: 66 | - Interactive document chat 67 | - Document management 68 | - Conversation history 69 | - Model configuration 70 | - Command palette with shortcuts 71 | 72 | ## 🛠️ Configuration 73 | 74 | DocPixie uses environment variables for API key configuration: 75 | 76 | ```bash 77 | # For OpenAI (default) 78 
| export OPENAI_API_KEY="your-openai-key" 79 | 80 | # For Anthropic Claude 81 | export ANTHROPIC_API_KEY="your-anthropic-key" 82 | 83 | # For OpenRouter (supports many models) 84 | export OPENROUTER_API_KEY="your-openrouter-key" 85 | ``` 86 | 87 | You can also specify the provider: 88 | 89 | ```python 90 | from docpixie import DocPixie, DocPixieConfig 91 | 92 | config = DocPixieConfig( 93 | provider="anthropic", # or "openai", "openrouter" 94 | model="claude-3-opus-20240229", 95 | vision_model="claude-3-opus-20240229" 96 | ) 97 | 98 | docpixie = DocPixie(config=config) 99 | ``` 100 | 101 | ## 📚 Supported File Types 102 | 103 | - **PDF files** (.pdf) - Full multipage support 104 | - More file types coming soon 105 | 106 | ## 🏗️ Architecture 107 | 108 | DocPixie uses a clean, modular architecture: 109 | 110 | ``` 111 | 📁 Core Components 112 | ├── 🧠 Adaptive RAG Agent - Dynamic task planning and execution 113 | ├── 👁️ Vision Processing - Document-to-image conversion via PyMuPDF 114 | ├── 🔌 Provider System - Unified interface for AI providers 115 | ├── 💾 Storage Backends - Local filesystem or in-memory storage 116 | └── 🖥️ CLI Interface - Modern terminal UI with Textual 117 | 118 | 📁 Processing Flow 119 | 1. Document → Images (PyMuPDF) 120 | 2. Vision-based summarization 121 | 3. Adaptive query processing 122 | 4. Intelligent page selection 123 | 5. 
Response synthesis 124 | ``` 125 | 126 | ### Key Design Principles 127 | 128 | - **Provider-Agnostic**: Generic model configuration works across all providers 129 | - **Image-Based Processing**: All documents converted to images, preserving visual context 130 | - **Business Logic Separation**: Raw API operations separate from workflow logic 131 | - **Adaptive Intelligence**: Single agent mode that dynamically adjusts based on findings 132 | 133 | ## 🎯 Use Cases 134 | 135 | - **Research & Analysis**: Query academic papers, reports, and research documents 136 | - **Document Q&A**: Interactive questioning of PDFs, contracts, and manuals 137 | - **Content Discovery**: Find specific information across large document collections 138 | - **Visual Document Processing**: Handle documents with charts, diagrams, and complex layouts 139 | 140 | ## 🌍 Environment Variables 141 | 142 | | Variable | Description | Default | 143 | |----------|-------------|---------| 144 | | `OPENAI_API_KEY` | OpenAI API key | None | 145 | | `ANTHROPIC_API_KEY` | Anthropic API key | None | 146 | | `OPENROUTER_API_KEY` | OpenRouter API key | None | 147 | | `DOCPIXIE_PROVIDER` | AI provider | `openai` | 148 | | `DOCPIXIE_STORAGE_PATH` | Storage directory | `./docpixie_data` | 149 | | `DOCPIXIE_JPEG_QUALITY` | Image quality (1-100) | `90` | 150 | 151 | ## 📖 Documentation 152 | 153 | - [Getting Started Guide](docs/getting-started.md) - Detailed examples and tutorials 154 | - [CLI Tool Guide](docs/cli-tool.md) - Complete CLI documentation 155 | 156 | ## 🤝 Contributing 157 | 158 | 1. Fork the repository 159 | 2. Create a feature branch (`git checkout -b feature/amazing-feature`) 160 | 3. Commit your changes (`git commit -m 'Add amazing feature'`) 161 | 4. Push to the branch (`git push origin feature/amazing-feature`) 162 | 5. Open a Pull Request 163 | 164 | ## 📄 License 165 | 166 | This project is licensed under the MIT License - see the LICENSE file for details. 
167 | 168 | ## 🙏 Acknowledgments 169 | 170 | - Built with [PyMuPDF](https://pymupdf.readthedocs.io/) for PDF processing 171 | - CLI powered by [Textual](https://textual.textualize.io/) 172 | - Supports OpenAI, Anthropic, and OpenRouter APIs 173 | 174 | --- 175 | -------------------------------------------------------------------------------- /docpixie/storage/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Base storage interface for documents 3 | """ 4 | 5 | from abc import ABC, abstractmethod 6 | from typing import List, Optional, Dict, Any 7 | import logging 8 | 9 | from ..models.document import Document, Page 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class BaseStorage(ABC): 15 | """Base class for storage backends""" 16 | 17 | @abstractmethod 18 | async def save_document(self, document: Document) -> str: 19 | """ 20 | Save a processed document 21 | 22 | Args: 23 | document: Document to save 24 | 25 | Returns: 26 | Document ID 27 | """ 28 | pass 29 | 30 | @abstractmethod 31 | async def get_document(self, document_id: str) -> Optional[Document]: 32 | """ 33 | Retrieve a document by ID 34 | 35 | Args: 36 | document_id: ID of document to retrieve 37 | 38 | Returns: 39 | Document or None if not found 40 | """ 41 | pass 42 | 43 | @abstractmethod 44 | async def list_documents(self, limit: Optional[int] = None) -> List[Dict[str, Any]]: 45 | """ 46 | List all documents with metadata 47 | 48 | Args: 49 | limit: Maximum number of documents to return 50 | 51 | Returns: 52 | List of document metadata dicts 53 | """ 54 | pass 55 | 56 | @abstractmethod 57 | async def delete_document(self, document_id: str) -> bool: 58 | """ 59 | Delete a document and its associated files 60 | 61 | Args: 62 | document_id: ID of document to delete 63 | 64 | Returns: 65 | True if deletion was successful 66 | """ 67 | pass 68 | 69 | @abstractmethod 70 | async def document_exists(self, document_id: str) -> bool: 71 | """ 72 | 
async def search_documents(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
    """
    Simple text search in document names and summaries
    Default implementation - subclasses can override for better search

    Matching is a case-insensitive substring test against the 'name' and
    'summary' entries of each metadata dict returned by list_documents().
    Missing or None entries are treated as empty text.

    Args:
        query: Search query
        limit: Maximum results; values <= 0 yield an empty list

    Returns:
        List of matching document metadata, in list_documents() order
    """
    if limit <= 0:
        return []

    all_docs = await self.list_documents()
    query_lower = query.lower()
    matching_docs = []

    for doc_meta in all_docs:
        # `.get(key, '')` only covers a missing key; a stored None (e.g. a
        # document that has no summary yet) must be normalised as well.
        name_text = doc_meta.get('name') or ''
        summary_text = doc_meta.get('summary') or ''

        if query_lower in name_text.lower() or query_lower in summary_text.lower():
            matching_docs.append(doc_meta)
            if len(matching_docs) >= limit:
                break

    return matching_docs
| original_query: str, 28 | task_results: List[TaskResult] 29 | ) -> str: 30 | """ 31 | Synthesize multiple task results into a final comprehensive response 32 | 33 | Args: 34 | original_query: The user's original question 35 | task_results: List of completed task results to combine 36 | 37 | Returns: 38 | Synthesized response that addresses the original query 39 | """ 40 | if not task_results: 41 | logger.warning("No task results provided for synthesis") 42 | return "I couldn't find any relevant information to answer your query." 43 | 44 | try: 45 | logger.info(f"Synthesizing response from {len(task_results)} task results") 46 | 47 | # Build results text from all task findings 48 | results_text = self._build_results_text(task_results) 49 | 50 | # Generate synthesis prompt 51 | prompt = SYNTHESIS_PROMPT.format( 52 | original_query=original_query, 53 | results_text=results_text 54 | ) 55 | 56 | messages = [ 57 | {"role": "system", "content": SYSTEM_SYNTHESIS}, 58 | {"role": "user", "content": prompt} 59 | ] 60 | 61 | # Get synthesized response 62 | result = await self.provider.process_text_messages( 63 | messages=messages, 64 | max_tokens=2048, # Longer response for synthesis 65 | temperature=0.2 # Low temperature for consistent synthesis 66 | ) 67 | 68 | logger.info("Successfully synthesized final response") 69 | return result.strip() 70 | 71 | except Exception as e: 72 | logger.error(f"Failed to synthesize response: {e}") 73 | # Fallback: return basic combination of results 74 | return self._create_fallback_response(original_query, task_results) 75 | 76 | def _build_results_text(self, task_results: List[TaskResult]) -> str: 77 | """Build formatted text from all task results""" 78 | results_sections = [] 79 | 80 | for i, result in enumerate(task_results, 1): 81 | section = f"""TASK {i}: {result.task.name} 82 | Description: {result.task.description} 83 | Analysis: {result.analysis} 84 | 85 | ---""" 86 | results_sections.append(section) 87 | 88 | return 
def validate_synthesis_quality(self, synthesized_response: str) -> bool:
    """
    Basic validation of synthesis quality

    Args:
        synthesized_response: The synthesized response to validate

    Returns:
        True if response meets basic quality criteria: at least 50
        characters after stripping, does not contain the text
        "SYNTHESIS_PROMPT", and is not a short (< 100 characters)
        "I couldn't find" reply
    """
    text = synthesized_response
    if not text:
        return False

    # Minimum length on the stripped text; this also rejects blank input
    if len(text.strip()) < 50:
        return False

    leaked_prompt_name = "SYNTHESIS_PROMPT" in text
    terse_not_found = "I couldn't find" in text and len(text) < 100
    return not (leaked_prompt_name or terse_not_found)
async def process_text_messages(
    self,
    messages: List[Dict[str, Any]],
    max_tokens: int = 300,
    temperature: float = 0.3
) -> str:
    """
    Process text-only messages through OpenRouter API

    The request uses self.config.model and asks OpenRouter to include usage
    accounting in the response so the call's cost can be tracked.

    Args:
        messages: Chat messages to send
        max_tokens: Maximum tokens to generate
        temperature: Sampling temperature

    Returns:
        The first choice's message text, stripped of surrounding whitespace

    Raises:
        ProviderError: If the request fails or the response carries no text
    """
    try:
        response = await self.client.chat.completions.create(
            model=self.config.model,
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            extra_body={
                "usage": {
                    "include": True,
                },
            },
        )

        content = response.choices[0].message.content
        if content is None:
            # Report the real problem instead of an AttributeError on None
            raise ValueError("response contained no text content")

        result = content.strip()
        logger.debug(f"OpenRouter text response: {result[:50]}...")

        # Track cost if available. The attribute may be absent or None; in
        # either case a successful completion must still be returned.
        cost = getattr(getattr(response, 'usage', None), 'cost', None)
        if cost is not None:
            self.last_api_cost = cost
            self.total_cost += cost
            logger.debug(f"OpenRouter API cost: ${cost}")
        else:
            self.last_api_cost = None

        return result

    except Exception as e:
        logger.error(f"OpenRouter text processing failed: {e}")
        raise ProviderError(f"Text processing failed: {e}", "openrouter") from e
Track cost if available 100 | if hasattr(response, 'usage') and hasattr(response.usage, 'cost'): 101 | self.last_api_cost = response.usage.cost 102 | self.total_cost += response.usage.cost 103 | logger.debug(f"OpenRouter API cost: ${response.usage.cost}") 104 | else: 105 | self.last_api_cost = None 106 | 107 | return result 108 | 109 | except Exception as e: 110 | logger.error(f"OpenRouter multimodal processing failed: {e}") 111 | raise ProviderError(f"Multimodal processing failed: {e}", "openrouter") 112 | 113 | def _prepare_openai_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 114 | """Prepare messages for OpenRouter API by converting image paths to data URLs""" 115 | processed_messages = [] 116 | 117 | for message in messages: 118 | if message["role"] == "system": 119 | # System messages are text-only 120 | processed_messages.append(message) 121 | elif message["role"] == "user" and isinstance(message["content"], list): 122 | # User message with multimodal content 123 | processed_content = [] 124 | 125 | for content_item in message["content"]: 126 | if content_item["type"] == "text": 127 | processed_content.append(content_item) 128 | elif content_item["type"] == "image_path": 129 | # Convert image path to OpenRouter format (same as OpenAI) 130 | image_path = content_item["image_path"] 131 | if self._validate_image_path(image_path): 132 | image_data_url = self._create_image_data_url(image_path) 133 | processed_content.append({ 134 | "type": "image_url", 135 | "image_url": { 136 | "url": image_data_url, 137 | "detail": content_item.get("detail", "high") 138 | } 139 | }) 140 | else: 141 | logger.warning(f"Skipping invalid image path: {image_path}") 142 | else: 143 | # Pass through other content types 144 | processed_content.append(content_item) 145 | 146 | processed_messages.append({ 147 | "role": message["role"], 148 | "content": processed_content 149 | }) 150 | else: 151 | # Regular text message 152 | processed_messages.append(message) 153 
class ContextProcessor:
    """
    Processes conversation history to create optimized context for RAG

    When conversation exceeds max_turns_before_summary turns:
    - Summarizes the older turns (at least the first turns_to_summarize)
    - Includes the last turns_to_keep_full turns in full
    - Creates condensed context for query reformulation

    A "turn" starts at every message whose role is "user".
    """

    def __init__(self, provider: BaseProvider, config: DocPixieConfig):
        """
        Store the provider and copy the conversation limits from the config

        Args:
            provider: Provider used to generate the conversation summary
            config: Supplies max_conversation_turns, turns_to_summarize and
                turns_to_keep_full
        """
        self.provider = provider
        self.max_turns_before_summary = config.max_conversation_turns
        self.turns_to_summarize = config.turns_to_summarize
        self.turns_to_keep_full = config.turns_to_keep_full

    async def process_conversation_context(
        self,
        messages: List[ConversationMessage],
        current_query: str
    ) -> Tuple[str, List[ConversationMessage]]:
        """
        Process conversation history and return optimized context

        Args:
            messages: List of conversation messages
            current_query: The current user query

        Returns:
            Tuple of (processed_context_string, messages_for_display)

        Raises:
            ContextProcessingError: If context processing fails
        """
        try:
            # A turn is counted per user message
            turns = self._count_turns(messages)

            if turns <= self.max_turns_before_summary:
                # No summarization needed
                context = self._format_messages_as_context(messages)
                return context, messages

            logger.info(f"Conversation has {turns} turns, applying context summarization")

            messages_to_summarize, messages_to_keep = self._split_messages_for_summary(messages)

            if not messages_to_summarize:
                # Nothing fell into the "old" part, so there is nothing to
                # condense; skip the provider call instead of summarizing
                # an empty transcript.
                return self._format_messages_as_context(messages_to_keep), messages_to_keep

            summary = await self._summarize_conversation_chunk(messages_to_summarize)

            # Build final context: summary, then recent turns, then the query
            context_parts = [f"Previous Conversation Summary:\n{summary}\n"]

            if messages_to_keep:
                context_parts.append("Recent Conversation:")
                context_parts.append(self._format_messages_as_context(messages_to_keep))

            context_parts.append(f"\nCurrent Query: {current_query}")

            final_context = "\n".join(context_parts)

            # Report the number of turns that were really summarized, which
            # can exceed turns_to_summarize when older turns are folded in.
            summarized_turns = self._count_turns(messages_to_summarize)
            summary_message = ConversationMessage(
                role="system",
                content=f"[Conversation Summary of First {summarized_turns} Turns]\n{summary}"
            )
            display_messages = [summary_message] + messages_to_keep

            return final_context, display_messages

        except ContextProcessingError:
            # Already logged and wrapped by the summarization step
            raise
        except Exception as e:
            logger.error(f"Context processing failed: {e}")
            raise ContextProcessingError(f"Failed to process conversation context: {e}") from e

    def _count_turns(self, messages: List[ConversationMessage]) -> int:
        """Count conversation turns (user messages only)"""
        return sum(1 for msg in messages if msg.role == "user")

    def _split_messages_for_summary(
        self,
        messages: List[ConversationMessage]
    ) -> Tuple[List[ConversationMessage], List[ConversationMessage]]:
        """
        Split messages into parts to summarize and keep

        The split is located by user messages rather than by fixed
        user/assistant pairs, so a leading system message or any other
        misalignment does not break turn detection. The two returned lists
        always concatenate back to ``messages``: turns that exceed the
        turns_to_keep_full cap are moved into the part to summarize instead
        of being discarded.

        Args:
            messages: Full conversation history, oldest first

        Returns:
            Tuple of (messages_to_summarize, messages_to_keep)
        """
        # Index of every message that opens a turn
        turn_starts = [i for i, msg in enumerate(messages) if msg.role == "user"]

        split_index = 0
        wanted = self.turns_to_summarize
        if 0 < wanted <= len(turn_starts):
            last_summarized_start = turn_starts[wanted - 1]
            # Only split once the last summarized turn has a reply after it
            if last_summarized_start + 1 < len(messages):
                if wanted < len(turn_starts):
                    split_index = turn_starts[wanted]
                else:
                    split_index = len(messages)

        # Keep at most the last N turns in full; older ones get summarized
        if self.turns_to_keep_full > 0:
            kept_starts = [i for i in turn_starts if i >= split_index]
            if len(kept_starts) > self.turns_to_keep_full:
                split_index = kept_starts[-self.turns_to_keep_full]

        return messages[:split_index], messages[split_index:]

    def _format_messages_as_context(self, messages: List[ConversationMessage]) -> str:
        """
        Format messages as readable context

        Each message becomes "<Role>: <content>"; entries are separated by a
        blank line. Roles other than "user" and "system" are labelled
        "Assistant".
        """
        role_labels = {"user": "User", "system": "System"}
        formatted_parts = [
            f"{role_labels.get(msg.role, 'Assistant')}: {msg.content}"
            for msg in messages
        ]
        return "\n\n".join(formatted_parts)

    async def _summarize_conversation_chunk(self, messages: List[ConversationMessage]) -> str:
        """
        Summarize a chunk of conversation

        Args:
            messages: The messages to condense

        Returns:
            The provider's summary with surrounding whitespace removed

        Raises:
            ContextProcessingError: If the provider call fails
        """
        try:
            conversation_text = self._format_messages_as_context(messages)

            prompt = CONVERSATION_SUMMARIZATION_PROMPT.format(
                conversation_text=conversation_text
            )

            messages_for_api = [
                {"role": "system", "content": "You are a helpful assistant that creates concise conversation summaries."},
                {"role": "user", "content": prompt}
            ]

            summary = await self.provider.process_text_messages(
                messages=messages_for_api,
                max_tokens=500,
                temperature=0.3
            )

            return summary.strip()

        except Exception as e:
            logger.error(f"Conversation summarization failed: {e}")
            raise ContextProcessingError(f"Failed to summarize conversation: {e}") from e
def __post_init__(self):
    """Initialize and validate configuration

    Runs after the dataclass-generated ``__init__``. The steps are
    order-dependent: the storage directory is created first, then missing
    API keys are filled from the environment, then provider default models
    are applied, and only then are the settings validated.

    Raises:
        ValueError: If the selected provider has no API key (unless one of
            the keys is the literal "test-key"), if pdf_render_scale is not
            positive, or if jpeg_quality is outside 1-100
    """
    # Create storage directory if it doesn't exist
    # NOTE(review): this happens before validation, so an invalid config
    # can still leave the directory behind.
    if self.storage_type == "local":
        Path(self.local_storage_path).mkdir(parents=True, exist_ok=True)

    # Load API keys from environment if not provided
    if not self.openai_api_key:
        self.openai_api_key = os.getenv("OPENAI_API_KEY")

    if not self.anthropic_api_key:
        self.anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")

    if not self.openrouter_api_key:
        self.openrouter_api_key = os.getenv("OPENROUTER_API_KEY")

    # Set provider-specific default models if using defaults
    self._set_provider_defaults()

    # Skip validation with test API keys (for testing)
    # The key check is skipped as soon as ANY of the three keys equals
    # "test-key", not only the key of the selected provider.
    if self.openai_api_key != "test-key" and self.anthropic_api_key != "test-key" and self.openrouter_api_key != "test-key":
        # Validate required settings based on provider
        if self.provider == "openai" and not self.openai_api_key:
            raise ValueError("OpenAI API key is required when using OpenAI provider")

        if self.provider == "anthropic" and not self.anthropic_api_key:
            raise ValueError("Anthropic API key is required when using Anthropic provider")

        if self.provider == "openrouter" and not self.openrouter_api_key:
            raise ValueError("OpenRouter API key is required when using OpenRouter provider")

    # Validate image settings
    if self.pdf_render_scale <= 0:
        raise ValueError("PDF render scale must be positive")

    if self.jpeg_quality < 1 or self.jpeg_quality > 100:
        raise ValueError("JPEG quality must be between 1 and 100")
@classmethod
def from_env(cls) -> 'DocPixieConfig':
    """Create config from environment variables

    Reads the optional ``DOCPIXIE_*`` overrides listed below and passes
    every variable that is set to the constructor. Variables that are not
    set are simply not passed.

    Returns:
        A new config instance built from the overrides that were found

    Raises:
        ValueError: If an integer-valued variable does not hold an integer
    """
    # Environment variable -> (config field, converter for the raw string)
    env_mapping = {
        'DOCPIXIE_PROVIDER': ('provider', str),
        'DOCPIXIE_STORAGE_PATH': ('local_storage_path', str),
        'DOCPIXIE_JPEG_QUALITY': ('jpeg_quality', int),
        'DOCPIXIE_MAX_AGENT_ITERATIONS': ('max_agent_iterations', int),
        'DOCPIXIE_LOG_LEVEL': ('log_level', str),
    }

    config_dict = {}
    for env_var, (config_field, convert) in env_mapping.items():
        value = os.getenv(env_var)
        if value is None:
            continue
        try:
            config_dict[config_field] = convert(value)
        except ValueError as e:
            # Name the offending variable; int()'s own message does not
            raise ValueError(
                f"Invalid value for {env_var}: {value!r} (expected an integer)"
            ) from e

    return cls(**config_dict)
class AnthropicProvider(BaseProvider):
    """Anthropic Claude provider for raw API operations

    System messages are not sent as separate entries: the first system
    message's content is prepended to the first message when that message
    has the "user" role.
    """

    def __init__(self, config: DocPixieConfig):
        """
        Create the async Anthropic client

        Args:
            config: Supplies anthropic_api_key and vision_model

        Raises:
            ValueError: If no Anthropic API key is configured
            ImportError: If the ``anthropic`` package is not installed
        """
        super().__init__(config)

        if not config.anthropic_api_key:
            raise ValueError("Anthropic API key is required")

        # Import here to make it optional dependency
        try:
            import anthropic
        except ImportError as e:
            raise ImportError("Anthropic library not found. Install with: pip install anthropic") from e

        self.client = anthropic.AsyncAnthropic(api_key=config.anthropic_api_key)
        self.model = config.vision_model  # Use vision model for multimodal operations

    async def process_text_messages(
        self,
        messages: List[Dict[str, Any]],
        max_tokens: int = 300,
        temperature: float = 0.3
    ) -> str:
        """Process text-only messages through Anthropic API

        Args:
            messages: Chat messages with "role" and "content" keys
            max_tokens: Upper bound on generated tokens
            temperature: Sampling temperature

        Returns:
            Text of the first content block of the response, stripped

        Raises:
            ProviderError: If preparing the request or the API call fails
        """
        try:
            # Convert system message format for Anthropic
            claude_messages = self._prepare_claude_text_messages(messages)

            response = await self.client.messages.create(
                model=self.model,
                max_tokens=max_tokens,
                temperature=temperature,
                messages=claude_messages
            )

            result = response.content[0].text.strip()
            logger.debug(f"Anthropic text response: {result[:50]}...")

            return result

        except Exception as e:
            logger.error(f"Anthropic text processing failed: {e}")
            raise ProviderError(f"Text processing failed: {e}", "anthropic") from e

    async def process_multimodal_messages(
        self,
        messages: List[Dict[str, Any]],
        max_tokens: int = 300,
        temperature: float = 0.3
    ) -> str:
        """Process multimodal messages (text + images) through Anthropic Vision API

        Args:
            messages: Chat messages; user content may be a list of items
                with "text" or "image_path" types
            max_tokens: Upper bound on generated tokens
            temperature: Sampling temperature

        Returns:
            Text of the first content block of the response, stripped

        Raises:
            ProviderError: If preparing the request or the API call fails
        """
        try:
            # Process messages to convert image paths to base64
            claude_messages = self._prepare_claude_multimodal_messages(messages)

            response = await self.client.messages.create(
                model=self.model,
                max_tokens=max_tokens,
                temperature=temperature,
                messages=claude_messages
            )

            result = response.content[0].text.strip()
            logger.debug(f"Anthropic multimodal response: {result[:50]}...")

            return result

        except Exception as e:
            logger.error(f"Anthropic multimodal processing failed: {e}")
            raise ProviderError(f"Multimodal processing failed: {e}", "anthropic") from e

    @staticmethod
    def _first_system_content(messages: List[Dict[str, Any]]) -> Any:
        """Return the content of the first system message, or None"""
        for message in messages:
            if message["role"] == "system":
                return message["content"]
        return None

    def _prepare_claude_text_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Prepare text-only messages for Claude API (handle system messages)

        System messages are removed and the first one's content is
        prepended to the first remaining message when it is a user message.
        The returned dicts are shallow copies, so the caller's messages are
        never modified and can be safely reused.
        """
        system_content = self._first_system_content(messages)

        # Copy each message so the prepend below cannot leak into the input
        claude_messages = [dict(message) for message in messages if message["role"] != "system"]

        if system_content and claude_messages and claude_messages[0]["role"] == "user":
            original_content = claude_messages[0]["content"]
            claude_messages[0]["content"] = f"{system_content}\n\n{original_content}"

        return claude_messages

    def _prepare_claude_multimodal_messages(self, messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Prepare multimodal messages for Claude API by converting image paths to base64

        ``image_path`` items are replaced with base64 ``image`` items; items
        whose path fails validation are skipped with a warning. The system
        content is prepended to the first message when it is a user message,
        whether its content is a list of items or a plain string.
        """
        system_content = self._first_system_content(messages)
        claude_messages = []

        for message in messages:
            if message["role"] == "system":
                continue  # Folded into the first user message below

            is_first = not claude_messages

            if message["role"] == "user" and isinstance(message["content"], list):
                # User message with multimodal content
                processed_content = []

                for content_item in message["content"]:
                    if content_item["type"] == "image_path":
                        image_path = content_item["image_path"]
                        # presumably both helpers come from BaseProvider; verify there
                        if self._validate_image_path(image_path):
                            processed_content.append({
                                "type": "image",
                                "source": {
                                    "type": "base64",
                                    # NOTE(review): assumes every page image is a JPEG
                                    "media_type": "image/jpeg",
                                    "data": self._encode_image(image_path)
                                }
                            })
                        else:
                            logger.warning(f"Skipping invalid image path: {image_path}")
                    else:
                        # Text and any other content types pass through unchanged
                        processed_content.append(content_item)

                if system_content and is_first:
                    processed_content.insert(0, {
                        "type": "text",
                        "text": system_content
                    })

                claude_messages.append({
                    "role": message["role"],
                    "content": processed_content
                })
            elif (
                system_content
                and is_first
                and message["role"] == "user"
                and isinstance(message["content"], str)
            ):
                # Plain-text first user message: prepend on a copy so the
                # system prompt is not silently lost
                claude_messages.append({
                    **message,
                    "content": f"{system_content}\n\n{message['content']}"
                })
            else:
                # Regular text message
                claude_messages.append(message)

        return claude_messages
8 | 9 | ## Development Commands 10 | 11 | ### Environment Setup 12 | ```bash 13 | # Set up virtual environment with uv (recommended) 14 | uv venv 15 | source .venv/bin/activate 16 | 17 | uv pip install docpixie 18 | ``` 19 | 20 | Start the CLI: 21 | ```bash 22 | docpixie 23 | ``` 24 | 25 | ## Core Architecture 26 | 27 | ### Provider System 28 | The codebase uses a clean separation between **raw API operations** and **business logic**: 29 | 30 | - **Providers** (`docpixie/providers/`): Handle only raw API calls with generic `process_text_messages()` and `process_multimodal_messages()` methods 31 | - **AI Operations** (`docpixie/ai/`): Contain all business logic, prompt construction, and workflow orchestration 32 | 33 | ### Key Architectural Principles 34 | 35 | 1. **Provider-Agnostic Configuration**: Uses generic `model` and `vision_model` fields that work across all providers 36 | 2. **Automatic Provider Defaults**: `DocPixieConfig._set_provider_defaults()` sets appropriate models based on selected provider 37 | 3. **Image-Based Processing**: All documents converted to images via PyMuPDF, preserving visual information 38 | 4. **Adaptive RAG Agent**: Single adaptive mode that dynamically plans and re-evaluates tasks based on findings (replaces Flash/Pro modes in Phase 2) 39 | 40 | ### Provider Implementation Pattern 41 | When adding new providers: 42 | 1. Inherit from `BaseProvider` 43 | 2. Implement only `process_text_messages()` and `process_multimodal_messages()` 44 | 3. Handle provider-specific message formatting (e.g., image_path → provider format) 45 | 4. Add to `providers/factory.py` and provider defaults in `config.py` 46 | 47 | Example: OpenRouter provider uses OpenAI client with `base_url="https://openrouter.ai/api/v1"` 48 | 49 | ### Document Processing Flow 50 | 1. **PDF → Images**: PyMuPDF converts PDF pages to optimized JPEGs 51 | 2. **Storage**: Local filesystem or in-memory storage via pluggable backends 52 | 3.
**Summarization**: Vision models analyze all page images in single API call for document summary 53 | 4. **Adaptive RAG Pipeline** (Phase 2): Vision-based page selection + dynamic task planning + conversation processing 54 | 55 | ### Configuration System 56 | - Environment-first approach: API keys loaded from `OPENAI_API_KEY`, `ANTHROPIC_API_KEY`, `OPENROUTER_API_KEY` 57 | - Provider-agnostic model configuration 58 | - Agent-specific settings (max iterations, pages per task, conversation context) 59 | - Test API key support: Use `"test-key"` to bypass validation during testing 60 | 61 | ### File Structure Significance 62 | 63 | ``` 64 | docpixie/ 65 | ├── core/config.py # Central configuration with provider defaults 66 | ├── providers/ # Raw API operations only 67 | │ ├── base.py # Generic message processing interface 68 | │ ├── openai.py # OpenAI API client 69 | │ ├── anthropic.py # Claude API client (handles different system message format) 70 | │ ├── openrouter.py # OpenRouter using OpenAI client + different base_url 71 | │ └── factory.py # Provider creation and validation 72 | ├── ai/ # Business logic layer 73 | │ ├── summarizer.py # Page/document summarization workflows 74 | │ ├── agent.py # Main adaptive RAG agent orchestrator 75 | │ ├── task_planner.py # Dynamic task planning with document selection 76 | │ ├── page_selector.py # Vision-based page selection 77 | │ ├── context_processor.py # Conversation summarization 78 | │ ├── query_reformulator.py # Reference resolution 79 | │ ├── query_classifier.py # Document need classification 80 | │ ├── synthesizer.py # Response synthesis 81 | │ └── prompts.py # All AI prompts 82 | ├── processors/ # Document-to-image conversion 83 | │ ├── pdf.py # PyMuPDF implementation 84 | │ └── factory.py # Auto-detection of processor type 85 | ├── storage/ # Pluggable storage backends 86 | │ ├── local.py # Filesystem storage 87 | │ └── memory.py # In-memory storage (for testing) 88 | ├── models/ 89 | │ ├── document.py # Core data 
models without embeddings 90 | │ └── agent.py # Agent task/plan data models 91 | ├── exceptions.py # Custom exception classes 92 | └── __init__.py # Main API entry point 93 | ``` 94 | 95 | ## Important Implementation Details 96 | 97 | ### Configuration Testing 98 | Never use test mode flags. Instead, use test API keys (`"test-key"`) which automatically bypass validation. 99 | 100 | ### Document Summarization 101 | The critical architectural decision: document summaries use ALL page images in a single vision API call, not individual page summaries combined. This preserves visual context and document structure. 102 | 103 | ### Provider Message Format 104 | All providers receive messages with `image_path` type, then convert to their specific format: 105 | - OpenAI: `image_url` with data URL 106 | - Anthropic: `image` with base64 data 107 | - OpenRouter: Same as OpenAI 108 | 109 | ### Adaptive RAG Agent Implementation 110 | The agent operates in a single adaptive mode with dynamic task planning: 111 | 1. **Context Processing**: Summarizes conversation when > 8 turns 112 | 2. **Query Reformulation**: Resolves references using context (outputs JSON) 113 | 3. **Query Classification**: Determines if documents needed (reasoning + needs_documents) 114 | 4. **Task Planning**: Creates 2-4 focused tasks with single document assignments 115 | 5. **Adaptive Execution**: Re-evaluates and modifies task list after each completion 116 | 6. 
**Response Synthesis**: Combines all task findings into comprehensive response 117 | 118 | ## Environment Variables 119 | 120 | ```bash 121 | # Required for respective providers 122 | OPENAI_API_KEY=your_openai_key 123 | ANTHROPIC_API_KEY=your_anthropic_key 124 | OPENROUTER_API_KEY=your_openrouter_key 125 | 126 | # Optional configuration overrides 127 | DOCPIXIE_PROVIDER=openai|anthropic|openrouter 128 | DOCPIXIE_STORAGE_PATH=./docpixie_data 129 | DOCPIXIE_MAX_AGENT_ITERATIONS=5 130 | DOCPIXIE_JPEG_QUALITY=90 131 | ``` 132 | 133 | ## Development Guidelines 134 | 135 | ### Code Modification Priority 136 | **CRITICAL**: When implementing new features, always prioritize modifying existing code over creating new files or methods unless absolutely necessary. This maintains codebase coherence and avoids unnecessary duplication. 137 | 138 | ### Error Handling Philosophy 139 | Error handling should be simple and direct - raise appropriate custom exceptions from `docpixie/exceptions.py` instead of implementing fallback mechanisms. This ensures clear failure modes and easier debugging. 140 | 141 | ### Prompt Management 142 | All AI prompts must be centralized in `docpixie/ai/prompts.py`. This includes system prompts, user prompts, and any template strings used for AI interactions. Never embed prompts directly in component files. 143 | 144 | ### Agent Task Architecture 145 | Each agent task should be assigned to exactly **one document** (not multiple). This simplifies page selection and analysis while maintaining clear scope boundaries. 146 | -------------------------------------------------------------------------------- /docs/cli-tool.md: -------------------------------------------------------------------------------- 1 | # DocPixie CLI Tool 2 | 3 | DocPixie includes a modern, interactive terminal interface built with Textual that provides a beautiful and intuitive way to chat with your documents. 
4 | 5 | ## 🚀 Quick Start 6 | 7 | ### Starting the CLI 8 | 9 | ```bash 10 | # Start the interactive CLI 11 | docpixie 12 | ``` 13 | 14 | ## 🎛️ First-Time Setup 15 | 16 | When you first run the CLI, you'll be prompted to enter your API key: 17 | 18 | ``` 19 | ┌─────────────────────────────────────────────────────────────────┐ 20 | │ Welcome to DocPixie! │ 21 | │ │ 22 | │ DocPixie needs an OpenRouter API key to work │ 23 | │ with documents. │ 24 | │ │ 25 | │ Get your API key from: │ 26 | │ https://openrouter.ai/keys │ 27 | │ │ 28 | │ [ API Key Input ] │ 29 | │ │ 30 | │ Press Enter to confirm • Press Esc to quit │ 31 | └─────────────────────────────────────────────────────────────────┘ 32 | ``` 33 | 34 | > **Note**: While the setup screen mentions OpenRouter, DocPixie CLI supports all providers (OpenAI, Anthropic, OpenRouter). You can set any provider's API key as an environment variable before starting the CLI. 35 | 36 | ## 🎨 Interface Overview 37 | 38 | The CLI interface consists of several key areas: 39 | 40 | ``` 41 | ┌─ DocPixie ──────────────────────────────────── 12:34:56 PM ─┐ 42 | │ │ 43 | │ ┌─ Chat Area ───────────────────────────────────────────┐ │ 44 | │ │ │ │ 45 | │ │ Welcome to DocPixie! │ │ 46 | │ │ 2 documents indexed and ready! 
│ │ 47 | │ │ │ │ 48 | │ │ Start chatting or type / for commands │ │ 49 | │ └────────────────────────────────────────────────────────┘ │ 50 | │ │ 51 | │ Status: Ready • 2 documents indexed │ 52 | │ │ 53 | │ > [ Input Area ] │ 54 | │ │ 55 | │ Enter to send • Shift+Enter for new line • Ctrl+/ commands │ 56 | │ │ 57 | ├─────────────────────────────────────────────────────────────┤ 58 | │ ^N New ^L Conversations ^O Models ^D Docs ^/ Cmds ^Q │ 59 | └─────────────────────────────────────────────────────────────┘ 60 | ``` 61 | 62 | ## ⌨️ Keyboard Shortcuts 63 | 64 | ### Global Shortcuts 65 | 66 | | Shortcut | Action | Description | 67 | |----------|--------|-------------| 68 | | `Ctrl+N` | New Conversation | Start a fresh conversation | 69 | | `Ctrl+L` | Conversations | Manage conversation history | 70 | | `Ctrl+O` | Model Config | Configure AI models/providers | 71 | | `Ctrl+D` | Documents | Manage documents | 72 | | `Ctrl+/` | Commands | Toggle command palette | 73 | | `Ctrl+Q` | Quit | Exit the application | 74 | 75 | ### Chat Input Shortcuts 76 | 77 | | Shortcut | Action | Description | 78 | |----------|--------|-------------| 79 | | `Enter` | Send Message | Submit your message | 80 | | `Shift+Enter` | New Line | Add line break in message | 81 | 82 | ## 🛠️ Command System 83 | 84 | DocPixie CLI includes a powerful command system. Type `/` to open the command palette or use slash commands directly. 85 | 86 | ### Available Commands 87 | 88 | #### `/new` - New Conversation 89 | Starts a fresh conversation, clearing chat history. 90 | 91 | ``` 92 | > /new 93 | ``` 94 | 95 | #### `/clear` - Clear Chat 96 | Clears the current chat display (conversation is still saved). 97 | 98 | ``` 99 | > /clear 100 | ``` 101 | 102 | #### `/save` - Save Conversation 103 | Manually saves the current conversation to history. 
104 | 105 | ``` 106 | > /save 107 | ``` 108 | 109 | #### `/conversations` - Conversation Manager 110 | Opens the conversation management dialog where you can: 111 | - View conversation history 112 | - Load previous conversations 113 | - Delete old conversations 114 | 115 | #### `/model` - Model Configuration 116 | Opens the model selector where you can: 117 | - Switch between providers (OpenAI, Anthropic, OpenRouter) 118 | - Configure model settings 119 | - View current model status 120 | 121 | #### `/documents` - Document Manager 122 | Opens the document management interface where you can: 123 | - View indexed documents 124 | - Add new documents 125 | - Remove documents from the index 126 | - See document statistics 127 | 128 | #### `/exit` - Exit Application 129 | Saves the current conversation and exits the CLI. 130 | 131 | ``` 132 | > /exit 133 | ``` 134 | 135 | ### Command Palette 136 | 137 | Press `Ctrl+/` or type `/` to open the interactive command palette: 138 | 139 | ``` 140 | ┌─ Commands ─────────────────────────────────────────────────┐ 141 | │ │ 142 | │ > /new Start new conversation │ 143 | │ /clear Clear current chat │ 144 | │ /save Save conversation │ 145 | │ /conversations Manage conversations │ 146 | │ /model Configure AI model │ 147 | │ /documents Manage documents │ 148 | │ /exit Exit DocPixie │ 149 | │ │ 150 | └─────────────────────────────────────────────────────────────┘ 151 | ``` 152 | 153 | Use arrow keys to navigate and Enter to select a command. 154 | 155 | ## 📚 Document Management 156 | 157 | ### Adding Documents 158 | 159 | The CLI automatically discovers and indexes PDF files from a `./documents` directory in your current working directory. Simply: 160 | 161 | 1. Create a `./documents` folder 162 | 2. Copy your PDF files into it 163 | 3. 
Restart the CLI or use the `/documents` command to refresh 164 | 165 | 166 | 167 | ### Supported File Types 168 | 169 | - **PDF files** (.pdf) - Multi-page documents 170 | 171 | ### Features 172 | 173 | - **Auto-save**: Conversations are automatically saved 174 | - **Context awareness**: Previous messages provide context for new queries 175 | - **Search**: Find conversations by content or title 176 | - **Export**: Save conversations to text files 177 | 178 | ## 🎯 Chat Features 179 | 180 | ### Smart Document Analysis 181 | 182 | DocPixie's CLI uses an adaptive RAG agent that: 183 | 184 | 1. **Analyzes your question** to determine if documents are needed 185 | 2. **Plans tasks** dynamically based on available documents 186 | 3. **Selects relevant pages** using vision AI 187 | 4. **Synthesizes responses** from multiple sources 188 | 5. **Maintains context** across conversation turns 189 | 190 | ## ⚙️ Configuration 191 | 192 | ### CLI Settings 193 | 194 | The CLI stores settings in: 195 | - **macOS/Linux**: `~/.docpixie/config.json` 196 | - **Windows**: `%APPDATA%\.docpixie\config.json` 197 | 198 | --- 199 | 200 | The DocPixie CLI provides a powerful, interactive way to work with your documents. Its adaptive AI agent, beautiful interface, and comprehensive features make document analysis both efficient and enjoyable. 201 | 202 | Happy chatting!
def __init__(self) -> None:
    """Set up empty application state and the storage/config helpers"""
    # Documents and conversation currently loaded in the session
    self.indexed_documents: List[Document] = []
    self.conversation_history: List[ConversationMessage] = []
    self.current_conversation_id: Optional[str] = None
    # Relative path, so it resolves against the current working directory
    self.documents_folder = Path("./documents")
    self.processing = False

    # Command palette / input line UI state
    self.command_palette_active = False
    self.partial_command = ""
    self.default_input_hint = (
        "Press / for commands • Shift+Enter: new line • Shift+Tab: switch panel"
    )

    # Agent plan currently tracked and the tasks already finished
    self.current_plan: Optional[Any] = None
    self.completed_tasks: Set = set()

    # Helpers for settings and persisted conversations
    self.config_manager = get_config_manager()
    self.conversation_storage = ConversationStorage()
def limit_conversation_history(self, max_messages: int = 20) -> None:
    """Trim the conversation history to its most recent messages.

    Args:
        max_messages: Maximum number of trailing messages to keep. Zero or
            a negative value keeps nothing.
    """
    if max_messages <= 0:
        # ``history[-0:]`` is the *whole* list, so a non-positive limit must
        # be handled explicitly instead of going through the slice below.
        self.conversation_history = []
        return

    if len(self.conversation_history) > max_messages:
        self.conversation_history = self.conversation_history[-max_messages:]
def load_conversation(self, conversation_id: str) -> bool:
    """Switch to a stored conversation.

    Asks the conversation storage for ``conversation_id``; on a hit the
    stored messages become the active history and the ID becomes current.

    Returns:
        True when the storage returned a truthy result, False otherwise
        (in which case the current state is left untouched).
    """
    stored = self.conversation_storage.load_conversation(conversation_id)
    if not stored:
        return False

    # Storage hands back a (metadata, messages) pair; only messages are kept.
    _metadata, history = stored
    self.current_conversation_id = conversation_id
    self.conversation_history = history
    return True
class CommandItem:
    """Represents a single command in the palette.

    Attributes:
        command: Slash-command text, e.g. ``"/new"``.
        description: Human-readable summary shown next to the command.
        handler: Optional callable attached to the command; ``None`` when no
            handler is attached.
    """

    def __init__(
        self,
        command: str,
        description: str,
        handler: Optional[Callable] = None,
    ):
        self.command = command
        self.description = description
        self.handler = handler

    def __str__(self) -> str:
        return f"{self.command} - {self.description}"

    def __repr__(self) -> str:
        return (
            f"{type(self).__name__}(command={self.command!r}, "
            f"description={self.description!r})"
        )
DocPixieCommandPalette(Container): 28 | """Command palette overlay widget for DocPixie""" 29 | 30 | DEFAULT_CSS = """ 31 | DocPixieCommandPalette { 32 | display: none; 33 | layer: overlay; 34 | dock: bottom; 35 | offset: 0 -4; /* Position above input area */ 36 | width: 80; 37 | height: auto; 38 | max-height: 15; 39 | background: #2d1f2d; /* match app background */ 40 | border: solid #ff99cc; /* brand pink border */ 41 | padding: 1; 42 | align: center bottom; 43 | } 44 | 45 | DocPixieCommandPalette.visible { 46 | display: block; 47 | } 48 | 49 | #command-list { 50 | height: auto; 51 | max-height: 12; 52 | scrollbar-background: #2d1f2d; 53 | scrollbar-color: #ff99cc; /* brand pink scrollbar */ 54 | } 55 | 56 | .command-item { 57 | height: 1; 58 | padding: 0 1; 59 | } 60 | 61 | .command-item.--highlight { 62 | background: #4a3344; 63 | color: $text; 64 | } 65 | 66 | .command-item-selected { 67 | background: #4a3344; 68 | border-left: thick #ff99cc; 69 | color: $text; 70 | } 71 | 72 | #filter-display { 73 | background: #2d1f2d; 74 | color: #ff99cc; 75 | height: 1; 76 | padding: 0 1; 77 | margin: 0 0 1 0; 78 | } 79 | """ 80 | 81 | COMMANDS = [ 82 | CommandItem("/new", "Start a new conversation (Ctrl+N)"), 83 | CommandItem("/conversations", "Switch between conversations (Ctrl+L)"), 84 | CommandItem("/save", "Save current conversation"), 85 | CommandItem("/clear", "Clear current chat display"), 86 | CommandItem("/model", "Configure Planning and Vision models (Ctrl+M)"), 87 | CommandItem("/documents", "Manage and index documents (Ctrl+D)"), 88 | CommandItem("/help", "Show all available commands"), 89 | CommandItem("/exit", "Exit the program (Ctrl+Q)"), 90 | ] 91 | 92 | def __init__(self, **kwargs): 93 | super().__init__(**kwargs) 94 | self.filtered_commands: List[CommandItem] = [] 95 | self.selected_index = 0 96 | self.current_filter = "" 97 | self.command_items: List[ListItem] = [] 98 | 99 | def compose(self): 100 | """Create the command palette UI""" 101 | with 
def _update_commands(self, filter_text: str) -> None:
    """Rebuild the visible command list for ``filter_text``.

    Keeps the commands whose name starts with the filter (case-insensitive),
    or every command when the filter is empty, repopulates the
    ``#command-list`` view and marks the first entry as selected.

    Args:
        filter_text: Prefix typed by the user; an empty string shows all
            commands.
    """
    if filter_text:
        prefix = filter_text.lower()  # lower-case once, not once per command
        self.filtered_commands = [
            cmd for cmd in self.COMMANDS
            if cmd.command.lower().startswith(prefix)
        ]
    else:
        self.filtered_commands = self.COMMANDS.copy()

    # A new filter always restarts the selection at the top of the list.
    self.selected_index = 0

    list_view = self.query_one("#command-list", ListView)
    list_view.clear()

    self.command_items = []
    for cmd in self.filtered_commands:
        command_text = Text()
        command_text.append(cmd.command, style="bold #ff99cc")
        command_text.append(" - ", style="dim")
        command_text.append(cmd.description, style="white")

        list_item = ListItem(Static(command_text), classes="command-item")
        list_view.append(list_item)
        self.command_items.append(list_item)

    if self.command_items:
        self.command_items[0].add_class("command-item-selected")
async def process(self, file_path: str, document_id: Optional[str] = None) -> Document:
    """
    Process image file into a single-page document

    Args:
        file_path: Path to image file
        document_id: Optional custom document ID

    Returns:
        Document with single page

    Raises:
        ProcessingError: If the image cannot be processed; the original
            exception is attached as ``__cause__``.
    """
    self._validate_file(file_path)
    logger.info(f"Processing image: {file_path}")

    # Track the directory created by *this* call separately from
    # ``self.temp_dir`` so the error path can never remove a directory that
    # belongs to an earlier, successful call (e.g. when mkdtemp itself fails).
    temp_dir = None
    try:
        temp_dir = tempfile.mkdtemp(prefix="docpixie_img_")
        # _process_image_sync reads its output directory from self.temp_dir.
        self.temp_dir = temp_dir

        # Inside a coroutine the running loop is the one to use.
        loop = asyncio.get_running_loop()
        page = await loop.run_in_executor(
            None,
            self._process_image_sync,
            file_path
        )

        # Create document with single page
        document = self._create_document(file_path, [page], document_id)
        document.status = DocumentStatus.COMPLETED

        # Update page with document info
        for doc_page in document.pages:
            doc_page.document_name = document.name
            doc_page.document_id = document.id

        logger.info(f"Successfully processed image: {file_path}")
        return document

    except Exception as e:
        logger.error(f"Failed to process image {file_path}: {e}")
        # Clean up only the temp directory this call created
        if temp_dir and os.path.exists(temp_dir):
            import shutil
            shutil.rmtree(temp_dir, ignore_errors=True)
            if self.temp_dir == temp_dir:
                self.temp_dir = None  # do not keep a dangling path around
        raise ProcessingError(f"Image processing failed: {e}", file_path) from e
def _optimize_image(self, img: Image.Image) -> Image.Image:
    """
    Optimize image for storage and processing

    Flattens any transparency onto a white background, converts the result
    to RGB, and downsizes the image (keeping its aspect ratio) when it
    exceeds ``config.pdf_max_image_size``.

    Args:
        img: Source image in any PIL mode.

    Returns:
        An RGB image no larger than the configured maximum size.
    """
    # 'RGBA' and 'LA' carry an alpha band; palette images may carry a
    # transparency entry. All three are flattened through RGBA so the alpha
    # channel is honoured as the paste mask.
    has_alpha = img.mode in ('RGBA', 'LA') or (
        img.mode == 'P' and 'transparency' in img.info
    )
    if has_alpha:
        rgba = img if img.mode == 'RGBA' else img.convert('RGBA')
        flattened = Image.new('RGB', rgba.size, (255, 255, 255))
        flattened.paste(rgba, mask=rgba.split()[-1])  # alpha channel as mask
        img = flattened
    elif img.mode != 'RGB':
        img = img.convert('RGB')

    # Resize if image is too large
    max_width, max_height = self.config.pdf_max_image_size
    if img.width > max_width or img.height > max_height:
        # Calculate new size maintaining aspect ratio; never collapse a
        # dimension to 0 pixels for extreme aspect ratios.
        ratio = min(max_width / img.width, max_height / img.height)
        new_width = max(1, int(img.width * ratio))
        new_height = max(1, int(img.height * ratio))

        img = img.resize((new_width, new_height), Image.Resampling.LANCZOS)
        logger.debug(f"Resized image to {new_width}x{new_height}")

    return img
async def on_text_area_changed(self: 'DocPixieTUI', event: TextArea.Changed) -> None:
    """Show, update or hide the command palette as the chat input changes.

    Only events from the ``chat-input`` text area are handled. The palette is
    driven by the last line of the input: it opens (or its filter is
    refreshed) while that line starts with ``/`` and is hidden otherwise.
    """
    if event.text_area.id != "chat-input":
        return

    # Take the text after the last *real* newline. The previous code split on
    # the two-character sequence backslash + "n", so the "current line" was
    # always the entire input.
    current_line = event.text_area.text.rpartition('\n')[2]
    palette_active = self.state_manager.command_palette_active

    if current_line.startswith("/"):
        command_palette = self.query_one("#command-palette", CommandPalette)
        if palette_active:
            command_palette.update_filter(current_line)
        else:
            self.state_manager.command_palette_active = True
            command_palette.show(current_line)
    elif palette_active:
        command_palette = self.query_one("#command-palette", CommandPalette)
        command_palette.hide()
        self.state_manager.command_palette_active = False
class ModelEventMixin:
    """Handles model selection events"""

    async def on_model_selected(self: 'DocPixieTUI', event: ModelSelected) -> None:
        """Report model changes in the chat log and apply them.

        A model counts as changed when the event carries a previous value for
        it and the new value differs. The event carries both the text and the
        vision pair, so both are checked independently: every change is
        reported, and the manager is asked to switch models once.
        """
        chat_log = self.query_one("#chat-log", ChatArea)

        text_changed = bool(event.old_text_model) and (
            event.text_model != event.old_text_model
        )
        vision_changed = bool(event.old_vision_model) and (
            event.vision_model != event.old_vision_model
        )

        if text_changed:
            chat_log.write(f"[green bold]●[/green bold] Action model switched to {event.text_model}\n\n")
        if vision_changed:
            chat_log.write(f"[green bold]●[/green bold] Vision model switched to {event.vision_model}\n\n")

        if text_changed or vision_changed:
            # One switch applies the new configuration for both models.
            await self.docpixie_manager.switch_models()
        else:
            chat_log.write("[dim]No model changes made[/dim]\n\n")

        status_label = self.query_one("#status-label", Label)
        status_label.update(self.state_manager.get_status_text())
async def on_documents_indexed(self: 'DocPixieTUI', event: DocumentsIndexed) -> None:
    """Register newly indexed documents and report how many were added.

    Documents whose ID is already present in the state manager are skipped
    and not counted. The status bar is refreshed afterwards.
    """
    chat_log = self.query_one("#chat-log", ChatArea)
    state = self.state_manager

    newly_added = 0
    for candidate in event.documents:
        already_known = any(
            known.id == candidate.id for known in state.indexed_documents
        )
        if already_known:
            continue
        state.add_document(candidate)
        newly_added += 1

    # Singular and plural wording are kept as separate messages.
    message = (
        f"[green bold]●[/green bold] Successfully indexed 1 document\n\n"
        if newly_added == 1
        else f"[green bold]●[/green bold] Successfully indexed {newly_added} documents\n\n"
    )
    chat_log.write(message)

    self.query_one("#status-label", Label).update(state.get_status_text())
def _process_pdf_sync(self, file_path: str) -> List[Page]:
    """Synchronous PDF processing with PyMuPDF

    Renders every page to an optimized JPEG inside ``self.temp_dir``. This is
    the blocking worker that ``process`` runs in an executor.

    Args:
        file_path: Path to the PDF file.

    Returns:
        One ``Page`` per PDF page, in page order.

    Raises:
        ProcessingError: If the file is missing or not a valid PDF, or if a
            page fails to render (the error then carries the 1-based page
            number).
    """
    pages = []
    pdf_doc = None

    try:
        # Open PDF document
        pdf_doc = fitz.open(file_path)
        total_pages = pdf_doc.page_count

        logger.info(f"Processing {total_pages} pages from PDF")

        # The render scale is identical for every page: build the matrix once.
        matrix = fitz.Matrix(
            self.config.pdf_render_scale,
            self.config.pdf_render_scale
        )

        for page_num in range(total_pages):
            try:
                page = pdf_doc[page_num]

                # Render page to pixmap
                pix = page.get_pixmap(
                    matrix=matrix,
                    alpha=False  # No transparency for JPEG
                )

                page_filename = f"page_{page_num + 1:03d}.jpg"
                page_image_path = os.path.join(self.temp_dir, page_filename)

                # Convert to PIL Image; the context manager releases the
                # decoded image once the page has been written.
                with Image.open(io.BytesIO(pix.tobytes("ppm"))) as img:
                    optimized_img = self._optimize_image(img)
                    optimized_img.save(
                        page_image_path,
                        'JPEG',
                        quality=self.config.jpeg_quality,
                        optimize=True
                    )

                pages.append(
                    Page(
                        page_number=page_num + 1,
                        image_path=page_image_path,
                        metadata={
                            'width': pix.width,
                            'height': pix.height,
                            'file_size': os.path.getsize(page_image_path)
                        }
                    )
                )

            except Exception as e:
                logger.error(f"Failed to process page {page_num + 1}: {e}")
                raise ProcessingError(
                    f"Failed to process page {page_num + 1}: {e}",
                    file_path,
                    page_num + 1
                ) from e

        return pages

    except ProcessingError:
        # Already carries the page number; do not re-wrap it as "unexpected".
        raise
    except fitz.FileDataError as e:
        raise ProcessingError(f"Invalid PDF file: {e}", file_path) from e
    except fitz.FileNotFoundError as e:
        raise ProcessingError(f"PDF file not found: {e}", file_path) from e
    except Exception as e:
        raise ProcessingError(f"Unexpected error processing PDF: {e}", file_path) from e
    finally:
        # Release the PDF handle on success and on every error path.
        if pdf_doc is not None:
            pdf_doc.close()
def create_thumbnail(self, image_path: str) -> str:
    """Create thumbnail for quick page selection

    The thumbnail is written next to the source image as
    ``<name>_thumb.jpg``.

    Args:
        image_path: Path to the source page image.

    Returns:
        Path of the thumbnail, or ``image_path`` itself if the thumbnail
        could not be created.
    """
    try:
        # Derive the name from the extension only. A plain
        # ``str.replace('.jpg', ...)`` also rewrites ".jpg" occurring earlier
        # in the path, and returns the source path unchanged (so the source
        # gets overwritten) when the file does not end in lower-case ".jpg".
        root, _extension = os.path.splitext(image_path)
        thumb_path = f"{root}_thumb.jpg"

        with Image.open(image_path) as img:
            thumbnail = img.copy()
            thumbnail.thumbnail(self.config.thumbnail_size, Image.Resampling.LANCZOS)

            # JPEG cannot store alpha or palette data.
            if thumbnail.mode not in ('RGB', 'L'):
                thumbnail = thumbnail.convert('RGB')

            thumbnail.save(thumb_path, 'JPEG', quality=85, optimize=True)

        return thumb_path

    except Exception as e:
        logger.error(f"Failed to create thumbnail for {image_path}: {e}")
        return image_path  # Return original if thumbnail creation fails
def __init__(self, config: DocPixieConfig):
    """Start with empty document and summary maps and record the creation time."""
    self.config = config
    self._created_at = datetime.now()

    # Summaries live in their own map, keyed by document id like the documents
    self._document_summaries: Dict[str, str] = {}
    self._documents: Dict[str, Document] = {}

    logger.info("Initialized in-memory storage")
async def list_documents(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
    """
    List stored documents, newest first.

    Args:
        limit: Maximum number of entries to return. ``None``, ``0`` or a
            negative value means "no limit".

    Returns:
        One summary dict per document (id, name, summary, page_count,
        created_at, updated_at, status), sorted by creation time descending.

    Raises:
        StorageError: If the listing could not be built.
    """
    try:
        documents = [
            {
                'id': document.id,
                'name': document.name,
                'summary': self._document_summaries.get(doc_id),
                'page_count': len(document.pages),
                'created_at': document.created_at.isoformat(),
                'updated_at': document.created_at.isoformat(),  # No update tracking in memory
                'status': document.status.value
            }
            for doc_id, document in self._documents.items()
        ]

        # Sort by creation time (newest first) BEFORE truncating; cutting the
        # iteration short first returned the oldest-inserted documents instead
        # of the newest ones.
        documents.sort(key=lambda x: x['created_at'], reverse=True)

        if limit is not None and limit > 0:
            documents = documents[:limit]
        return documents

    except Exception as e:
        logger.error(f"Failed to list documents in memory: {e}")
        raise StorageError(f"Failed to list documents: {e}") from e
async def search_documents(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
    """
    Search documents by case-insensitive substring match on name or summary.

    Args:
        query: Text to look for.
        limit: Maximum number of results to return.

    Returns:
        Matching document dicts ordered by ``relevance_score`` (highest
        first), at most ``limit`` entries. Returns an empty list when the
        search fails; the error is logged rather than raised.
    """
    try:
        matching_docs = []
        query_lower = query.lower()

        for doc_id, document in self._documents.items():
            # Treat a missing (or None) cached summary as empty text
            summary = self._document_summaries.get(doc_id) or ''

            name_match = query_lower in document.name.lower()
            summary_match = query_lower in summary.lower()
            if not (name_match or summary_match):
                continue

            matching_docs.append({
                'id': document.id,
                'name': document.name,
                'summary': summary,
                'page_count': len(document.pages),
                'created_at': document.created_at.isoformat(),
                'status': document.status.value,
                'relevance_score': self._calculate_relevance(
                    query_lower, document, summary
                )
            })

        # Rank every match first and only then truncate; stopping the scan
        # after `limit` hits could drop the most relevant documents.
        matching_docs.sort(key=lambda x: x['relevance_score'], reverse=True)
        return matching_docs[:max(limit, 0)]

    except Exception as e:
        logger.error(f"Failed to search documents in memory: {e}")
        return []
memory") 220 | 221 | def get_document_count(self) -> int: 222 | """Get total number of documents in memory""" 223 | return len(self._documents) 224 | 225 | def get_total_pages(self) -> int: 226 | """Get total number of pages across all documents""" 227 | return sum(len(doc.pages) for doc in self._documents.values()) -------------------------------------------------------------------------------- /docpixie/cli/docpixie_manager.py: -------------------------------------------------------------------------------- 1 | """ 2 | DocPixie integration manager for CLI application 3 | """ 4 | 5 | import asyncio 6 | from typing import TYPE_CHECKING, Optional, Any, Callable 7 | from pathlib import Path 8 | 9 | from docpixie import DocPixie, ConversationMessage 10 | from docpixie.core.config import DocPixieConfig 11 | from docpixie.models.document import Document 12 | 13 | from .config import get_config_manager 14 | from .state_manager import AppStateManager 15 | from .widgets import ChatArea, DocumentManagerDialog 16 | 17 | if TYPE_CHECKING: 18 | from .app import DocPixieTUI 19 | 20 | 21 | class DocPixieManager: 22 | """Manages DocPixie instance and all related operations""" 23 | 24 | def __init__(self, app: 'DocPixieTUI', state_manager: AppStateManager): 25 | self.app = app 26 | self.state_manager = state_manager 27 | self.config_manager = get_config_manager() 28 | self.docpixie: Optional[DocPixie] = None 29 | 30 | async def create_docpixie_instance(self) -> bool: 31 | try: 32 | api_key = self.config_manager.get_api_key() 33 | if not api_key: 34 | return False 35 | 36 | text_model, vision_model = self.config_manager.get_models() 37 | 38 | config = DocPixieConfig( 39 | provider="openrouter", 40 | model=text_model, 41 | vision_model=vision_model, 42 | storage_type="local", 43 | local_storage_path="./.docpixie/documents", 44 | openrouter_api_key=api_key, 45 | jpeg_quality=85, 46 | max_pages_per_task=4 47 | ) 48 | 49 | self.docpixie = DocPixie(config=config) 50 | self.app.docpixie 
async def check_and_prompt_for_documents(self) -> None:
    """
    Sync the state manager with indexed documents and prompt for new PDFs.

    Creates the documents folder when it is missing, reloads every document
    known to DocPixie into the state manager, and opens the Document Manager
    dialog when the folder was just created, contains no PDFs, or contains
    PDFs whose stem is not among the indexed document names.
    """
    chat_log = self.app.query_one("#chat-log", ChatArea)
    folder = self.state_manager.documents_folder

    async def open_document_manager() -> None:
        # Single place that builds and shows the Document Manager dialog
        await self.app.push_screen(DocumentManagerDialog(folder, self.docpixie))

    if not folder.exists():
        folder.mkdir(parents=True, exist_ok=True)
        chat_log.write(f"[green bold]●[/green bold] Created documents folder: {folder.absolute()}\n")
        chat_log.write("[blue bold]●[/blue bold] Add PDF files to the ./documents folder or use /documents to manage them.\n")
        # Auto-open the Document Manager when the folder is first created
        await open_document_manager()
        return

    self.state_manager.clear_documents()

    try:
        existing_docs = await self.docpixie.list_documents()
        indexed_names = {doc['name'] for doc in existing_docs}

        for doc_meta in existing_docs:
            doc = await self.docpixie.get_document(doc_meta['id'])
            if doc:
                self.state_manager.add_document(doc)

    except Exception as e:
        # Best effort: treat everything as not yet indexed and tell the user.
        indexed_names = set()
        # A real newline here; the previous "\\n" printed a literal backslash-n.
        chat_log.write(f"[dim]Note: Could not load existing documents: {e}[/dim]\n")

    pdf_files = list(folder.glob("*.pdf"))

    if not pdf_files:
        # Auto-open the Document Manager when there are no PDFs yet
        await open_document_manager()
        return

    new_pdf_files = [
        pdf for pdf in pdf_files
        if pdf.stem not in indexed_names
    ]

    if new_pdf_files:
        chat_log.write(f"[blue bold]●[/blue bold] Found {len(new_pdf_files)} new PDF file(s)\n")
        await open_document_manager()
async def process_query(self, query: str, task_callback: Optional[Callable] = None) -> None:
    """
    Run a user query against the indexed documents and render the result.

    Writes the answer, the analyzed pages, the processing time and the cost
    to the chat log, then appends the exchange to the conversation history
    and saves the conversation. Errors are reported in the chat log instead
    of being raised.

    Args:
        query: The user's question.
        task_callback: Optional callback forwarded to ``query_sync`` as the
            task-update callback.
    """
    chat_log = self.app.query_one("#chat-log", ChatArea)

    if not self.docpixie:
        # Real newline; the previous "\\n" printed a literal backslash-n
        chat_log.write("[error]❌ DocPixie not initialized[/error]\n")
        return

    if not self.state_manager.has_documents():
        chat_log.write("[warning]⚠️ No documents indexed yet. Use /documents to add and index documents first.[/warning]\n")
        return

    self.state_manager.set_processing(True)

    try:
        chat_log.show_processing_status()

        # Run the synchronous query on the default executor so the event loop
        # stays responsive. get_running_loop() is the supported way to obtain
        # the loop from inside a coroutine.
        loop = asyncio.get_running_loop()
        result = await loop.run_in_executor(
            None,
            self.docpixie.query_sync,
            query,
            None,  # mode
            None,  # document_ids
            None,  # max_pages
            self.state_manager.conversation_history,
            task_callback
        )

        chat_log.add_assistant_message(result.answer)

        if hasattr(result, 'get_pages_by_document'):
            pages_by_doc = result.get_pages_by_document()
            if pages_by_doc:
                chat_log.write("[dim]Analyzed documents:[/dim]\n")
                for doc_name, page_nums in pages_by_doc.items():
                    pages_str = ", ".join(str(p) for p in page_nums)
                    chat_log.write(f"[dim] • {doc_name}: Pages {pages_str}[/dim]\n")
        elif hasattr(result, 'page_numbers') and result.page_numbers:
            chat_log.write(f"[dim]Analyzed pages: {result.page_numbers}[/dim]\n")

        if hasattr(result, 'processing_time') and result.processing_time > 0:
            chat_log.write(f"[dim]Processing time: {result.processing_time:.2f}s[/dim]\n")

        # Missing or None cost counts as zero; sub-cent costs get more digits
        cost = getattr(result, 'total_cost', 0.0) or 0.0
        if cost < 0.01:
            chat_log.write(f"[dim]Cost: ${cost:.6f}[/dim]\n")
        else:
            chat_log.write(f"[dim]Cost: ${cost:.4f}[/dim]\n")

        chat_log.write("\n")

        self.state_manager.add_conversation_message(
            ConversationMessage(role="user", content=query)
        )
        self.state_manager.add_conversation_message(
            ConversationMessage(role="assistant", content=result.answer, cost=cost)
        )

        self.state_manager.limit_conversation_history()
        self.state_manager.save_current_conversation()

        status_label = self.app.query_one("#status-label")
        status_label.update(self.state_manager.get_status_text())

    except Exception as e:
        chat_log.write(f"[red bold]●[/red bold] Error: {e}\n\n")
    finally:
        # Always clear the processing flag, even when the query failed
        self.state_manager.set_processing(False)
def scan_documents(self) -> List[Path]:
    """
    Look for PDF files in the documents folder.

    Creates the folder when it does not exist yet. Prints either a numbered
    listing of the PDFs found or a hint to add some.

    Returns:
        The PDF paths found, or an empty list when there are none.
    """
    folder = self.documents_folder

    if not folder.exists():
        folder.mkdir(parents=True)
        print(f"📁 Created documents folder: {folder.absolute()}")

    found = list(folder.glob("*.pdf"))

    if found:
        print(f"\n📚 Found {len(found)} PDF file(s):")
        # One numbered entry per file, emitted as a single multi-line print
        listing = (f" {position}. {entry.name}" for position, entry in enumerate(found, 1))
        print("\n".join(listing))
        return found

    print(f"📭 No PDF files found in {folder.absolute()}")
    print("Please add PDF files to the documents folder and restart the program.")
    return []
def display_task_plan(self, plan, action="Current"):
    """Print every task of ``plan`` with a status icon, framed by rule lines."""
    rule = "=" * 60

    print("\n" + rule)
    print(f"📋 {action} Task Plan:")
    print(rule)

    for task in plan.tasks:
        # Completed is checked first, then in-progress; anything else shows as paused
        icon = (
            "✅" if task.status == TaskStatus.COMPLETED
            else "⏳" if task.status == TaskStatus.IN_PROGRESS
            else "⏸️ "
        )

        # Resolve the task's document id to a display name when it is indexed
        doc_info = ""
        if task.document:
            for candidate in self.indexed_documents:
                if candidate.id == task.document:
                    if candidate:
                        doc_info = f" [{candidate.name}]"
                    break

        print(f" {icon} {task.name}{doc_info}")
        if task.description:
            print(f" {task.description}")

    print(rule)
".join(str(p) for p in page_numbers) 200 | print(f" 📑 Selected pages: {pages_str}") 201 | else: 202 | print(f" 📑 No relevant pages found") 203 | 204 | elif event_type == 'task_completed': 205 | task = data['task'] 206 | result = data['result'] 207 | plan = data['plan'] 208 | self.current_task_plan = plan 209 | 210 | pages_analyzed = len(result.selected_pages) if hasattr(result, 'selected_pages') else 0 211 | print(f" ✅ Completed ({pages_analyzed} pages analyzed)") 212 | 213 | elif event_type == 'plan_updated': 214 | self.current_task_plan = data 215 | print("\n🔧 Task plan updated based on findings:") 216 | self.display_task_plan(data, "Updated") 217 | 218 | async def task_update_callback(self, event_type: str, data: Any): 219 | """Async callback for task updates""" 220 | self.display_task_update(event_type, data) 221 | 222 | def chat_loop(self): 223 | """Main chat interaction loop""" 224 | self.display_welcome_message() 225 | 226 | while True: 227 | try: 228 | user_input = input("\n👤 You: ").strip() 229 | 230 | if not user_input: 231 | continue 232 | 233 | if user_input.lower() == "/exit": 234 | print("\n👋 Goodbye!") 235 | break 236 | 237 | if user_input.lower() == "/new": 238 | self.conversation_history = [] 239 | print("\n🔄 Started new conversation") 240 | continue 241 | 242 | print("\n⏳ Processing query...") 243 | 244 | result = self.docpixie.query_sync( 245 | question=user_input, 246 | conversation_history=self.conversation_history, 247 | task_update_callback=self.task_update_callback 248 | ) 249 | 250 | print(self.format_answer(result)) 251 | 252 | self.conversation_history.append( 253 | ConversationMessage(role="user", content=user_input) 254 | ) 255 | self.conversation_history.append( 256 | ConversationMessage(role="assistant", content=result.answer) 257 | ) 258 | 259 | if len(self.conversation_history) > 20: 260 | self.conversation_history = self.conversation_history[-20:] 261 | 262 | except KeyboardInterrupt: 263 | print("\n\n👋 Interrupted. 
Goodbye!") 264 | break 265 | 266 | except Exception as e: 267 | print(f"\n❌ Error: {e}") 268 | print("Please try again or use /new to start fresh.") 269 | 270 | def run(self): 271 | """Main entry point for the CLI application""" 272 | print("\n🧚 DocPixie CLI - Document Chat Interface") 273 | print("="*60) 274 | 275 | if not self.initialize_docpixie(): 276 | return 1 277 | 278 | pdf_files = self.scan_documents() 279 | if not pdf_files: 280 | return 1 281 | 282 | print(f"\n❓ Index these {len(pdf_files)} document(s)? (y/n): ", end="") 283 | response = input().strip().lower() 284 | 285 | if response != 'y': 286 | print("📭 Indexing cancelled") 287 | return 0 288 | 289 | if not self.index_documents(pdf_files): 290 | return 1 291 | 292 | self.chat_loop() 293 | 294 | return 0 295 | 296 | 297 | def main(): 298 | """Main entry point""" 299 | cli = DocPixieCLI() 300 | sys.exit(cli.run()) 301 | 302 | 303 | if __name__ == "__main__": 304 | main() 305 | -------------------------------------------------------------------------------- /docpixie/cli/conversation_storage.py: -------------------------------------------------------------------------------- 1 | """ 2 | Local conversation storage for DocPixie CLI 3 | Stores conversations per project directory 4 | """ 5 | 6 | import json 7 | import uuid 8 | import os 9 | from pathlib import Path 10 | from typing import List, Dict, Any, Optional 11 | from datetime import datetime 12 | from dataclasses import dataclass, asdict 13 | 14 | from docpixie.models.agent import ConversationMessage 15 | 16 | 17 | @dataclass 18 | class ConversationMetadata: 19 | """Metadata for a conversation""" 20 | id: str 21 | name: str 22 | working_directory: str 23 | created_at: str 24 | updated_at: str 25 | message_count: int 26 | indexed_documents: List[str] 27 | total_cost: float = 0.0 28 | 29 | 30 | class ConversationStorage: 31 | """Manages local conversation storage in ./.docpixie/conversations/""" 32 | 33 | def __init__(self): 34 | """Initialize 
conversation storage for current directory""" 35 | self.base_path = Path("./.docpixie") 36 | self.conversations_dir = self.base_path / "conversations" 37 | self.metadata_file = self.conversations_dir / "metadata.json" 38 | 39 | self.conversations_dir.mkdir(parents=True, exist_ok=True) 40 | 41 | self.working_directory = str(Path.cwd().resolve()) 42 | 43 | self.current_conversation_id: Optional[str] = None 44 | 45 | self._load_metadata() 46 | 47 | def _load_metadata(self) -> Dict[str, ConversationMetadata]: 48 | """Load conversation metadata from file""" 49 | if not self.metadata_file.exists(): 50 | return {} 51 | 52 | try: 53 | with open(self.metadata_file, 'r') as f: 54 | data = json.load(f) 55 | 56 | metadata = {} 57 | for conv_id, conv_data in data.items(): 58 | if 'total_cost' not in conv_data: 59 | conv_data['total_cost'] = 0.0 60 | metadata[conv_id] = ConversationMetadata(**conv_data) 61 | 62 | return metadata 63 | except Exception as e: 64 | print(f"Warning: Failed to load conversation metadata: {e}") 65 | return {} 66 | 67 | def _save_metadata(self, metadata: Dict[str, ConversationMetadata]): 68 | """Save conversation metadata to file""" 69 | try: 70 | data = {} 71 | for conv_id, conv_meta in metadata.items(): 72 | data[conv_id] = asdict(conv_meta) 73 | 74 | with open(self.metadata_file, 'w') as f: 75 | json.dump(data, f, indent=2) 76 | except Exception as e: 77 | print(f"Error saving conversation metadata: {e}") 78 | 79 | def _conversation_file_path(self, conversation_id: str) -> Path: 80 | """Get path for conversation file""" 81 | return self.conversations_dir / f"{conversation_id}.json" 82 | 83 | def _generate_conversation_name(self, messages: List[ConversationMessage]) -> str: 84 | """Generate a conversation name from the first user message""" 85 | if not messages: 86 | return f"Chat {datetime.now().strftime('%Y-%m-%d %H:%M')}" 87 | 88 | first_user_message = None 89 | for msg in messages: 90 | if msg.role == "user": 91 | first_user_message = msg 92 | 
break 93 | 94 | if first_user_message: 95 | name = first_user_message.content.strip()[:50] 96 | if len(first_user_message.content) > 50: 97 | name += "..." 98 | return name 99 | else: 100 | return f"Chat {datetime.now().strftime('%Y-%m-%d %H:%M')}" 101 | 102 | def create_new_conversation(self, indexed_documents: List[str] = None) -> str: 103 | """Create a new conversation and return its ID""" 104 | conversation_id = str(uuid.uuid4()) 105 | now = datetime.now().isoformat() 106 | 107 | metadata = ConversationMetadata( 108 | id=conversation_id, 109 | name="New Chat", 110 | working_directory=self.working_directory, 111 | created_at=now, 112 | updated_at=now, 113 | message_count=0, 114 | indexed_documents=indexed_documents or [], 115 | total_cost=0.0 116 | ) 117 | 118 | conversation_data = { 119 | "id": conversation_id, 120 | "metadata": asdict(metadata), 121 | "messages": [] 122 | } 123 | 124 | conversation_file = self._conversation_file_path(conversation_id) 125 | with open(conversation_file, 'w') as f: 126 | json.dump(conversation_data, f, indent=2) 127 | 128 | all_metadata = self._load_metadata() 129 | all_metadata[conversation_id] = metadata 130 | self._save_metadata(all_metadata) 131 | 132 | self.current_conversation_id = conversation_id 133 | return conversation_id 134 | 135 | def save_conversation(self, conversation_id: str, messages: List[ConversationMessage], 136 | indexed_documents: List[str] = None): 137 | """Save conversation messages""" 138 | try: 139 | now = datetime.now().isoformat() 140 | 141 | messages_data = [] 142 | total_cost = 0.0 143 | for msg in messages: 144 | msg_dict = { 145 | "role": msg.role, 146 | "content": msg.content, 147 | "timestamp": msg.timestamp.isoformat() 148 | } 149 | msg_cost = getattr(msg, 'cost', 0.0) or 0.0 150 | msg_dict["cost"] = msg_cost 151 | total_cost += msg_cost 152 | messages_data.append(msg_dict) 153 | 154 | all_metadata = self._load_metadata() 155 | if conversation_id in all_metadata: 156 | conv_metadata = 
def load_conversation(self, conversation_id: str) -> Optional[tuple[ConversationMetadata, List[ConversationMessage]]]:
    """
    Read a stored conversation and make it the current one.

    Returns:
        ``(metadata, messages)`` on success. ``None`` when the conversation
        file does not exist or cannot be read; read failures are printed.
    """
    try:
        source = self._conversation_file_path(conversation_id)
        if not source.exists():
            return None

        with open(source, 'r') as handle:
            payload = json.load(handle)

        metadata = ConversationMetadata(**payload["metadata"])

        # Rebuild message objects; entries without a stored cost count as 0.0
        messages = [
            ConversationMessage(
                role=entry["role"],
                content=entry["content"],
                timestamp=datetime.fromisoformat(entry["timestamp"]),
                cost=entry.get("cost", 0.0)
            )
            for entry in payload["messages"]
        ]

        self.current_conversation_id = conversation_id
        return metadata, messages

    except Exception as e:
        print(f"Error loading conversation: {e}")
        return None
all_metadata[conversation_id].updated_at 271 | 272 | with open(conversation_file, 'w') as f: 273 | json.dump(data, f, indent=2) 274 | 275 | self._save_metadata(all_metadata) 276 | return True 277 | 278 | except Exception as e: 279 | print(f"Error renaming conversation: {e}") 280 | return False 281 | 282 | def get_last_conversation(self) -> Optional[str]: 283 | """Get the most recently updated conversation ID from current directory""" 284 | conversations = self.list_local_conversations() 285 | if conversations: 286 | return conversations[0].id 287 | return None --------------------------------------------------------------------------------