├── .cursorrules
├── .flake8
├── .github
│   └── pull_request_template.md
├── .gitignore
├── .pre-commit-config.yaml
├── CHANGELOG.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── THIRD_PARTY_LICENSES.md
├── TODO.md
├── commit_message.txt
├── content
│   ├── extracted
│   │   └── .include
│   ├── log
│   │   └── .include
│   ├── source
│   │   └── .include
│   └── test
│       └── source
│           ├── test.docx
│           ├── test.html
│           ├── test.jpeg
│           ├── test.pdf
│           └── test.pptx
├── docs
│   ├── api.md
│   ├── api
│   │   ├── file_extractor.md
│   │   └── image_description.md
│   ├── cli
│   │   ├── file_extractor.md
│   │   └── image_description.md
│   ├── getting_started.md
│   └── testing.md
├── examples
│   ├── README.md
│   ├── basic_extraction.py
│   ├── batch_processing.py
│   ├── custom_prompts.py
│   └── example_data
│       └── README.md
├── pyproject.toml
├── pytest.ini
├── pyvisionai
│   ├── __init__.py
│   ├── cli
│   │   ├── __init__.py
│   │   ├── describe_image.py
│   │   └── extract.py
│   ├── config
│   │   └── html_config.py
│   ├── core
│   │   ├── __init__.py
│   │   ├── extractor.py
│   │   └── factory.py
│   ├── describers
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── claude.py
│   │   ├── ollama.py
│   │   └── openai.py
│   ├── extractors
│   │   ├── __init__.py
│   │   ├── base.py
│   │   ├── docx.py
│   │   ├── docx_page.py
│   │   ├── html
│   │   │   └── browser.py
│   │   ├── html_page.py
│   │   ├── pdf.py
│   │   ├── pdf_page.py
│   │   ├── pptx.py
│   │   └── pptx_page.py
│   └── utils
│       ├── __init__.py
│       ├── benchmark.py
│       ├── config.py
│       ├── logger.py
│       └── retry.py
└── tests
    ├── __init__.py
    ├── conftest.py
    ├── core
    │   └── test_extractor.py
    ├── data
    │   ├── chart.png
    │   ├── charts.pptx
    │   ├── report.docx
    │   ├── sample.docx
    │   ├── sample.pdf
    │   ├── sample.pptx
    │   ├── sample_image.jpg
    │   └── technical_doc.pdf
    ├── describers
    │   ├── test_api_retry.py
    │   ├── test_base.py
    │   ├── test_claude.py
    │   ├── test_ollama.py
    │   └── test_openai.py
    ├── extractors
    │   └── test_pptx.py
    ├── file_extraction
    │   ├── test_extraction_cli.py
    │   └── test_extraction_lib.py
    ├── test_batch_processing.py
    ├── test_benchmarks.py
    ├── test_cli.py
    ├── test_custom_prompts.py
    ├── test_examples.py
    ├── test_extraction_cli.py
    ├── test_extraction_lib.py
    ├── test_image_description.py
    ├── test_integration.py
    └── utils
        ├── __init__.py
        ├── conftest.py
        ├── metrics.py
        ├── test_retry.py
        └── verifiers.py

/.cursorrules:
--------------------------------------------------------------------------------
1 | You are an expert in Python, FastAPI, microservices architecture, and serverless environments.
2 | 
3 | Advanced Principles
4 | - Design services to be stateless; leverage external storage and caches (e.g., Redis) for state persistence.
5 | - Implement API gateways and reverse proxies (e.g., NGINX, Traefik) for handling traffic to microservices.
6 | - Use circuit breakers and retries for resilient service communication.
7 | - Favor serverless deployment for reduced infrastructure overhead in scalable environments.
8 | - Use asynchronous workers (e.g., Celery, RQ) for handling background tasks efficiently.
9 | 
10 | Clean Architecture and Domain-Driven Design (DDD)
11 | - Enforce **Clean Architecture principles** by separating concerns into **layers (Domain, Application, Infrastructure, Presentation)**.
12 | - Use **Dependency Inversion** to abstract external providers (DB, cache, third-party APIs).
13 | - Ensure the **Domain Layer remains pure**, containing business rules without dependencies on external systems.
14 | - Apply **Domain-Driven Design (DDD)** as a **core** principle, ensuring entities, value objects, and aggregates are well-defined.
15 | - Avoid business logic in controllers or infrastructure layers—use **Application Services** for orchestration.
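The layering and dependency-inversion rules above are easiest to see in code. A minimal, hypothetical sketch (illustrative only, not part of the PyVisionAI codebase):

```python
from typing import Protocol

# Domain layer: pure business object, no imports from infrastructure.
class Document:
    def __init__(self, doc_id: str, text: str) -> None:
        self.doc_id = doc_id
        self.text = text

# Port owned by the application layer (Dependency Inversion).
class DocumentRepository(Protocol):
    def get(self, doc_id: str) -> Document: ...
    def save(self, document: Document) -> None: ...

# Application service: orchestrates a use case against the port,
# so no business logic leaks into controllers or infrastructure.
class SummarizeDocument:
    def __init__(self, repo: DocumentRepository) -> None:
        self.repo = repo  # injected; never a concrete database

    def execute(self, doc_id: str) -> str:
        return self.repo.get(doc_id).text[:200]  # stand-in business rule

# Infrastructure adapter: one concrete implementation of the port.
class InMemoryDocumentRepository:
    def __init__(self) -> None:
        self._store: dict[str, Document] = {}

    def get(self, doc_id: str) -> Document:
        return self._store[doc_id]

    def save(self, document: Document) -> None:
        self._store[document.doc_id] = document
```

Swapping the in-memory adapter for a real database touches only the infrastructure layer; the domain and application code above it stay unchanged.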
16 | 17 | CQRS (Command Query Responsibility Segregation) 18 | - **Separate read and write operations** to reduce coupling and optimize performance. 19 | - Implement **Query Handlers** for efficient data retrieval. 20 | - Use **Command Handlers** to process changes without affecting read-side models. 21 | - Consider **Event Sourcing** where applicable to maintain an audit log of state changes. 22 | 23 | Microservices and API Gateway Integration 24 | - Integrate FastAPI services with API Gateway solutions like Kong or AWS API Gateway. 25 | - Use API Gateway for rate limiting, request transformation, and security filtering. 26 | - Design APIs with clear separation of concerns to align with microservices principles. 27 | - Implement inter-service communication using message brokers (e.g., RabbitMQ, Kafka) for event-driven architectures. 28 | 29 | Serverless and Cloud-Native Patterns 30 | - Optimize FastAPI apps for serverless environments (e.g., AWS Lambda, Azure Functions) by minimizing cold start times. 31 | - Package FastAPI applications using lightweight containers or as a standalone binary for deployment in serverless setups. 32 | - Use managed services (e.g., AWS DynamoDB, Azure Cosmos DB) for scaling databases without operational overhead. 33 | - Implement automatic scaling with serverless functions to handle variable loads effectively. 34 | 35 | Advanced Middleware and Security 36 | - Implement custom middleware for detailed logging, tracing, and monitoring of API requests. 37 | - Use OpenTelemetry or similar libraries for distributed tracing in microservices architectures. 38 | - Apply security best practices: OAuth2 for secure API access, rate limiting, and DDoS protection. 39 | - Use security headers (e.g., CORS, CSP) and implement content validation using tools like OWASP Zap. 40 | 41 | Optimizing for Performance and Scalability 42 | - Leverage FastAPI's async capabilities for handling large volumes of simultaneous connections efficiently. 43 | - Optimize backend services for high throughput and low latency; use databases optimized for read-heavy workloads (e.g., Elasticsearch). 44 | - Use caching layers (e.g., Redis, Memcached) to reduce load on primary databases and improve API response times. 45 | - Apply load balancing and service mesh technologies (e.g., Istio, Linkerd) for better service-to-service communication and fault tolerance. 46 | 47 | Monitoring and Logging 48 | - Use Prometheus and Grafana for monitoring FastAPI applications and setting up alerts. 49 | - Implement structured logging for better log analysis and observability. 50 | - Integrate with centralized logging systems (e.g., ELK Stack, AWS CloudWatch) for aggregated logging and monitoring. 51 | 52 | Key Conventions 53 | 1. Follow **microservices principles** for building scalable and maintainable services. 54 | 2. Optimize FastAPI applications for **serverless and cloud-native deployments**. 55 | 3. Apply **Clean Architecture, DDD, and CQRS** to ensure **scalability, maintainability, and business logic purity**. 56 | 4. Use **security, monitoring, and performance optimization** techniques to build robust, performant APIs. 57 | 5. **Keep It Simple** 58 | Above all, prioritize simplicity and only apply the rules necessary for the use case. 59 | - *Example:* When you might be tempted to set up a complex event-driven pipeline, first consider whether a simpler, synchronous solution meets the immediate needs. 60 | 6. **Reasoning Approach** 61 | Avoid starting with a fixed conclusion. 
Begin with some doubt, explore multiple possibilities, 62 | investigate thoroughly, and only make a final conclusion once sufficient evidence and analysis 63 | have been considered. 64 | 7. **@Web Usage** 65 | The model is encouraged to use any relevant web references discovered (via `@Web`) at any time 66 | it finds fit, without waiting for explicit user permission. This helps enrich responses with 67 | properly cited sources. 68 | 69 | Refer to FastAPI, microservices, serverless, and Clean Architecture documentation for best practices and advanced usage patterns. 70 | 71 | PyVisionAI Project-Specific Guidelines 72 | - Document Processing Architecture 73 | - Maintain clear separation between document processors (PDF, DOCX, PPTX, HTML) 74 | - Use Strategy pattern for different extraction methods (text_and_images, page_as_image) 75 | - Implement Factory pattern for Vision Model providers (OpenAI, Ollama) 76 | - Keep extraction logic independent of vision model implementation 77 | 78 | - Vision Model Integration 79 | - Abstract vision model interfaces through base classes 80 | - Support both cloud (OpenAI) and local (Ollama) models 81 | - Implement proper retry mechanisms for API calls 82 | - Handle model-specific configuration and requirements 83 | 84 | - Performance Optimization 85 | - Use parallel processing for document extraction where appropriate 86 | - Implement proper resource cleanup for large documents 87 | - Optimize image processing for memory efficiency 88 | - Cache processed results when beneficial 89 | 90 | - CLI Design 91 | - Follow consistent parameter naming across commands 92 | - Provide clear, helpful error messages 93 | - Support both simple and advanced usage patterns 94 | - Maintain backward compatibility in parameter changes 95 | 96 | - Testing Strategy 97 | - Use fixtures for document processing tests 98 | - Mock external API calls in vision model tests 99 | - Implement proper cleanup of test artifacts 100 | - Maintain high test coverage (>80%) 101 | 102 | - Package Distribution 103 | - Support multiple installation methods (pip, poetry, homebrew) 104 | - Properly handle system dependencies 105 | - Maintain clear version compatibility requirements 106 | - Document all installation methods thoroughly 107 | 108 | Test-Driven Development (TDD) Rules 109 | - **NEVER modify production code while writing or fixing tests** 110 | - Tests must be written to match the current production behavior 111 | - If tests fail, document the failures and create separate tasks to fix production code 112 | - Follow strict Red-Green-Refactor cycle: write failing test first, then fix production code 113 | - Keep test code and production code changes in separate commits 114 | - Test files should mirror the structure of the production code 115 | - Tests should be independent and not rely on other tests' state 116 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | # Select only critical error types 3 | select = 4 | # Runtime errors 5 | E9, 6 | # Invalid syntax 7 | F63, 8 | # Syntax errors 9 | F7, 10 | # Undefined variables 11 | F82, 12 | # Bare except clauses 13 | E722 14 | 15 | # Ignore non-critical issues 16 | ignore = 17 | # Line too long 18 | E501, 19 | # Unused imports 20 | F401, 21 | # Line break before operator 22 | W503 23 | 24 | # Exclude directories 25 | exclude = 26 | # Git directory 27 | .git, 28 | # Virtual environments 29 | venv 30 | 31 | # 
Maximum complexity 32 | max-complexity = 15 33 | 34 | per-file-ignores = 35 | __init__.py:F401 36 | tests/*:F401 37 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Description 2 | 3 | Please include a summary of the changes and which issue is fixed. Include relevant motivation and context. 4 | 5 | Fixes # (issue) 6 | 7 | ## Type of change 8 | 9 | Please delete options that are not relevant. 10 | 11 | - [ ] Bug fix (non-breaking change which fixes an issue) 12 | - [ ] New feature (non-breaking change which adds functionality) 13 | - [ ] Breaking change (fix or feature that would cause existing functionality to not work as expected) 14 | - [ ] Documentation update 15 | - [ ] Performance improvement 16 | 17 | ## How Has This Been Tested? 18 | 19 | Please describe the tests that you ran to verify your changes: 20 | 21 | 1. Test A 22 | 2. Test B 23 | 3. ... 24 | 25 | ## Checklist: 26 | 27 | - [ ] My code follows the style guidelines of this project 28 | - [ ] I have performed a self-review of my own code 29 | - [ ] I have commented my code, particularly in hard-to-understand areas 30 | - [ ] I have made corresponding changes to the documentation 31 | - [ ] My changes generate no new warnings 32 | - [ ] I have added tests that prove my fix is effective or that my feature works 33 | - [ ] New and existing unit tests pass locally with my changes 34 | - [ ] Any dependent changes have been merged and published in downstream modules 35 | - [ ] I have updated the CHANGELOG.md file 36 | 37 | ## Additional context 38 | 39 | Add any other context or screenshots about the pull request here. 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.so 6 | .Python 7 | build/ 8 | dist/ 9 | *.egg-info/ 10 | 11 | # IDE 12 | .idea/ 13 | .vscode/ 14 | *.swp 15 | *.swo 16 | 17 | # Environment 18 | .env 19 | env/ 20 | venv/ 21 | .python-version 22 | *.lock 23 | poetry.lock 24 | 25 | # OS 26 | .DS_Store 27 | Thumbs.db 28 | 29 | # Project directories 30 | content/source/* 31 | !content/source/.include 32 | content/extracted/* 33 | !content/extracted/.include 34 | content/log/* 35 | !content/log/.include 36 | 37 | # Test files 38 | content/test/* 39 | !content/test/ 40 | !content/test/source/ 41 | !content/test/source/* 42 | content/test/output/* 43 | !content/test/output/.include 44 | 45 | # Logs 46 | *.log 47 | 48 | # Coverage 49 | .coverage 50 | coverage.xml 51 | htmlcov/ 52 | .pytest_cache/ 53 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.5.0 4 | hooks: 5 | - id: trailing-whitespace 6 | - id: end-of-file-fixer 7 | - id: check-yaml 8 | - id: check-added-large-files 9 | 10 | - repo: https://github.com/psf/black 11 | rev: 24.2.0 12 | hooks: 13 | - id: black 14 | language_version: python3.12 15 | 16 | - repo: https://github.com/pycqa/isort 17 | rev: 5.13.2 18 | hooks: 19 | - id: isort 20 | args: ["--profile", "black"] 21 | 22 | - repo: https://github.com/pycqa/flake8 23 | rev: 7.0.0 24 | hooks: 25 | - id: flake8 26 | additional_dependencies: 
[flake8-pyproject] 27 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to PyVisionAI 2 | 3 | Thank you for your interest in contributing to PyVisionAI! This document provides guidelines and instructions for contributing to the project. 4 | 5 | ## Development Setup 6 | 7 | 1. **Clone the repository** 8 | ```bash 9 | git clone https://github.com/MDGrey33/pyvisionai.git 10 | cd pyvisionai 11 | ``` 12 | 13 | 2. **Set up environment variables and services** 14 | ```bash 15 | # For GPT-4 Vision development 16 | export OPENAI_API_KEY='your-openai-key' 17 | 18 | # For Claude Vision development 19 | export ANTHROPIC_API_KEY='your-anthropic-key' 20 | 21 | # For local Llama development 22 | export OLLAMA_HOST='http://localhost:11434' # Optional, this is the default 23 | 24 | # Install and start Ollama (macOS) 25 | brew install ollama 26 | ollama serve & # Run in background 27 | 28 | # Or install Ollama (Linux) 29 | curl -fsSL https://ollama.com/install.sh | sh 30 | ollama serve & # Run in background 31 | 32 | # Pull required model for development 33 | ollama pull llama3.2-vision 34 | 35 | # Verify Ollama setup 36 | ollama list # Should show llama3.2-vision 37 | curl http://localhost:11434/api/tags # Should return JSON response 38 | ``` 39 | 40 | Note: For Windows development, download Ollama from https://ollama.com/download/windows 41 | and run it as a service. 42 | 43 | 3. **Set up Python environment** 44 | ```bash 45 | # Create virtual environment 46 | python -m venv venv 47 | source venv/bin/activate # Linux/macOS 48 | # or 49 | .\venv\Scripts\activate # Windows 50 | 51 | # Install dependencies for development 52 | pip install -e . 53 | pip install -r requirements-dev.txt 54 | ``` 55 | 56 | 4. **Install system dependencies** 57 | ```bash 58 | # macOS 59 | brew install --cask libreoffice 60 | brew install poppler 61 | 62 | # Ubuntu/Debian 63 | sudo apt-get install -y libreoffice poppler-utils 64 | 65 | # Windows 66 | # Install LibreOffice and Poppler manually 67 | ``` 68 | 69 | 5. **Install development tools** 70 | ```bash 71 | # Install pre-commit hooks 72 | pre-commit install 73 | ``` 74 | 75 | ## Code Style 76 | 77 | We use several tools to maintain code quality: 78 | 79 | 1. **Black** for code formatting 80 | - Line length: 72 characters 81 | - Target version: Python 3.12 82 | ```bash 83 | poetry run black . 84 | ``` 85 | 86 | 2. **isort** for import sorting 87 | - Compatible with Black 88 | - Line length: 72 characters 89 | ```bash 90 | poetry run isort . 91 | ``` 92 | 93 | 3. **Flake8** for style guide enforcement 94 | ```bash 95 | poetry run flake8 96 | ``` 97 | 98 | 4. **pydocstyle** for docstring checking 99 | - Following Google style 100 | ```bash 101 | poetry run pydocstyle 102 | ``` 103 | 104 | ## Testing 105 | 106 | 1. **Environment Setup** 107 | ```bash 108 | # Required for full test coverage 109 | export OPENAI_API_KEY='your-openai-key' 110 | export ANTHROPIC_API_KEY='your-anthropic-key' 111 | ``` 112 | 113 | 2. 
**Running Tests**
114 |    ```bash
115 |    # Run all tests
116 |    pytest
117 | 
118 |    # Run specific test categories
119 |    pytest tests/extractors/   # Test extractors
120 |    pytest tests/describers/   # Test vision models
121 |    pytest tests/test_cli.py   # Test CLI interface
122 | 
123 |    # Run tests for specific models
124 |    pytest -k "test_gpt4"    # Test GPT-4 Vision
125 |    pytest -k "test_claude"  # Test Claude Vision
126 |    pytest -k "test_llama"   # Test Llama Vision
127 |    ```
128 | 
129 |    Note: Tests requiring API keys will be skipped if the corresponding environment variable is not set.
130 | 
131 | 3. **Writing tests**
132 |    - Place tests in the `tests/` directory
133 |    - Match test file names with source files
134 |    - Use descriptive test names
135 |    - Include both success and failure cases
136 |    - Mock external services appropriately (see the sketch after the example below)
137 | 
138 |    Example test:
139 |    ```python
140 |    def test_pdf_extraction():
141 |        """Test PDF content extraction."""
142 |        extractor = create_extractor("pdf")
143 |        result = extractor.extract("tests/data/sample.pdf", "tests/output")
144 |        assert os.path.exists(result)
145 |        assert result.endswith(".md")
146 |    ```
147 | 
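Per the "mock external services" guideline, a minimal sketch of the mocking mechanics. A real test would patch PyVisionAI's actual describer entry point under `pyvisionai/describers/`, so verify the import path before adapting this:

```python
from unittest.mock import MagicMock

import pytest


def test_description_with_mocked_vision_model():
    """Mock the vision call so the test needs no API key or network."""
    model = MagicMock()
    model.describe.return_value = "A bar chart comparing revenue by year."

    # Success case: a canned response instead of a real API call.
    assert "bar chart" in model.describe("tests/data/chart.png")
    model.describe.assert_called_once_with("tests/data/chart.png")

    # Failure case: simulate a rate-limit error surfacing from the API.
    model.describe.side_effect = RuntimeError("429: rate limited")
    with pytest.raises(RuntimeError, match="rate limited"):
        model.describe("tests/data/chart.png")
```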
148 | ## Pull Request Process
149 | 
150 | 1. **Fork the repository**
151 |    - Create a fork on GitHub
152 |    - Clone your fork locally
153 | 
154 | 2. **Create a branch**
155 |    ```bash
156 |    git checkout -b feature/your-feature-name
157 |    # or
158 |    git checkout -b fix/your-fix-name
159 |    ```
160 | 
161 | 3. **Make your changes**
162 |    - Follow the code style guidelines
163 |    - Add/update tests as needed
164 |    - Update documentation if required
165 | 
166 | 4. **Commit your changes**
167 |    - Use meaningful commit messages
168 |    - Follow conventional commits format:
169 |      ```
170 |      type(scope): description
171 | 
172 |      [optional body]
173 | 
174 |      [optional footer]
175 |      ```
176 |    - Types: feat, fix, docs, style, refactor, test, chore
177 | 
178 | 5. **Push and create PR**
179 |    ```bash
180 |    git push origin your-branch-name
181 |    ```
182 |    Then:
183 |    - Go to the repository on GitHub
184 |    - Click "Pull Request"
185 |    - Fill out the PR template completely
186 |    - Link related issues
187 |    - Request review from maintainers
188 | 
189 | 6. **Review process**
190 |    - Integration checks must pass
191 |    - At least one maintainer review required
192 |    - Address review comments
193 |    - Keep PR focused and reasonable in size
194 | 
195 | ## Documentation
196 | 
197 | When adding new features or making changes:
198 | 
199 | 1. **Update API documentation**
200 |    - Add/update docstrings
201 |    - Update `docs/api.md` if needed
202 | 
203 | 2. **Update examples**
204 |    - Add example code in `examples/`
205 |    - Update example documentation
206 | 
207 | 3. **Update guides**
208 |    - Update relevant sections in guides
209 |    - Add new guides if needed
210 | 
211 | ## Release Process
212 | 
213 | 1. **Version bumping**
214 |    ```bash
215 |    poetry version patch # or minor, or major
216 |    ```
217 | 
218 | 2. **Update CHANGELOG.md**
219 |    - Add version section
220 |    - List all changes
221 |    - Credit contributors
222 | 
223 | 3. **Create release PR**
224 |    - Update version in pyproject.toml
225 |    - Update documentation
226 |    - Run all tests
227 | 
228 | ## Getting Help
229 | 
230 | - Create an issue for bugs or features
231 | - Join discussions in GitHub Discussions
232 | - Tag maintainers in complex issues
233 | - Check existing issues and PRs first
234 | 
235 | ## Code of Conduct
236 | 
237 | Please note that PyVisionAI has a [Code of Conduct](CODE_OF_CONDUCT.md). By participating in this project, you agree to abide by its terms.
238 | 
239 | ## Development Guidelines
240 | 
241 | ### Vision Model Integration
242 | 
243 | When implementing or modifying vision model support:
244 | 
245 | 1. **Model Interface**
246 |    - Implement the `BaseVisionModel` interface (see the skeleton sketch after these guidelines)
247 |    - Handle API key validation and configuration
248 |    - Implement proper retry logic for API calls
249 |    - Follow the established error handling patterns
250 | 
251 | 2. **Testing**
252 |    - Add comprehensive unit tests
253 |    - Include API error simulation tests
254 |    - Add integration tests with real API calls
255 |    - Ensure tests can run without API keys (using skip markers)
256 | 
257 | 3. **Documentation**
258 |    - Update API documentation
259 |    - Add usage examples
260 |    - Document environment variables
261 |    - Update CLI help messages
262 | 
263 | 4. **Error Handling**
264 |    - Use appropriate exception classes
265 |    - Add descriptive error messages
266 |    - Implement proper retry logic
267 |    - Handle rate limits gracefully
268 | 
269 | ### Model-Specific Guidelines
270 | 
271 | 1. **GPT-4 Vision**
272 |    - Follow OpenAI's best practices
273 |    - Handle token limits appropriately
274 |    - Implement proper error handling for API responses
275 |    - Use appropriate model versions
276 | 
277 | 2. **Claude Vision**
278 |    - Follow Anthropic's guidelines
279 |    - Handle API rate limits
280 |    - Implement proper retry logic
281 |    - Use appropriate model versions
282 | 
283 | 3. **Llama Vision**
284 |    - Handle local model availability
285 |    - Implement proper error handling
286 |    - Support custom model configurations
287 |    - Handle resource constraints
288 | 
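The guidelines above require implementing the `BaseVisionModel` interface with up-front key validation. A hypothetical provider skeleton; the real abstract base lives in `pyvisionai/describers/base.py` and its exact method names may differ:

```python
import os


class MyVisionModel:
    """Hypothetical provider skeleton following the guidelines above."""

    def __init__(self, api_key: str | None = None, model: str = "my-model-v1"):
        # Validate configuration up front, as the guidelines require.
        self.api_key = api_key or os.environ.get("MY_PROVIDER_API_KEY")
        if not self.api_key:
            raise ValueError("MY_PROVIDER_API_KEY is not set")
        self.model = model

    def describe_image(self, image_path: str, prompt: str) -> str:
        """A real implementation calls the provider API here, wrapped in
        the project's retry logic so rate limits are handled gracefully."""
        if not os.path.exists(image_path):
            raise FileNotFoundError(image_path)
        raise NotImplementedError("provider API call goes here")
```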
--------------------------------------------------------------------------------
/THIRD_PARTY_LICENSES.md:
--------------------------------------------------------------------------------
1 | # Third-Party Software Licenses and Acknowledgments
2 | 
3 | This document lists the third-party software packages used by PyVisionAI, along with their respective licenses and acknowledgments.
4 | 
5 | ## Core Dependencies
6 | 
7 | ### Playwright
8 | - **Purpose**: HTML processing and browser automation
9 | - **Copyright**: Copyright (c) Microsoft Corporation
10 | - **License**: Apache License 2.0
11 | - **Notice**: Playwright may download browser binaries during installation. These binaries are subject to their respective licenses:
12 |   - Chromium: BSD-style license
13 |   - Firefox: Mozilla Public License 2.0
14 |   - WebKit: LGPL/BSD-style license
15 | 
16 | ### Python-PPTX
17 | - **Purpose**: PowerPoint file processing
18 | - **License**: MIT License
19 | 
20 | ### Python-DOCX
21 | - **Purpose**: Word document processing
22 | - **License**: MIT License
23 | 
24 | ### PyPDF2
25 | - **Purpose**: PDF processing
26 | - **License**: BSD License
27 | 
28 | ### Pillow (PIL)
29 | - **Purpose**: Image processing
30 | - **License**: HPND License
31 | 
32 | ## License Texts
33 | 
34 | ### Apache License 2.0 (Playwright)
35 | ```
36 | Apache License
37 | Version 2.0, January 2004
38 | http://www.apache.org/licenses/
39 | 
40 | Licensed under the Apache License, Version 2.0 (the "License");
41 | you may not use this file except in compliance with the License.
42 | You may obtain a copy of the License at
43 | 
44 |     http://www.apache.org/licenses/LICENSE-2.0
45 | 
46 | Unless required by applicable law or agreed to in writing, software
47 | distributed under the License is distributed on an "AS IS" BASIS,
48 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
49 | See the License for the specific language governing permissions and
50 | limitations under the License.
51 | ```
52 | 
53 | Note: Full license texts for other dependencies can be found in their respective repositories.
--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
1 | ## PyVisionAI Improvements
2 | 
3 | ### 1. Enhanced Model Support (High Priority)
4 | - [x] Implement model factory pattern for better extensibility
5 | - [x] Add proper logging for model operations
6 | - [x] Ensure backward compatibility with existing model implementations
7 | - [ ] Add support for additional vision models based on user demand
8 | - [x] Implement model configuration validation system
9 | 
10 | ### 2. Robust Error Handling (High Priority)
11 | - [x] Implement comprehensive logging system
12 | - [x] Add proper error context and stack traces
13 | - [x] Create retry mechanism for API failures
14 | - [ ] Integrate retry mechanism with API calls
15 |   - [ ] Add retry for Ollama API (connection errors, rate limits, server errors)
16 |   - [ ] Add retry for OpenAI API (rate limits, server errors, timeouts)
17 |   - [ ] Add tests for retry behavior with mocked API responses
18 | - [ ] Implement graceful degradation for model failures (see the fallback sketch after section 4)
19 | - [ ] Add request timeout handling
20 | 
21 | ### 3. Performance Optimization (Medium Priority)
22 | - [ ] Implement adaptive concurrency based on system resources
23 | - [ ] Add caching mechanism for frequent requests
24 | - [ ] Optimize image preprocessing
25 | - [ ] Implement batch processing improvements
26 | - [ ] Add performance monitoring metrics
27 | 
28 | ### 4. Testing Improvements (Medium Priority)
29 | - [x] Add comprehensive logging tests
30 | - [x] Improve test coverage for model factory
31 | - [x] Add retry mechanism tests
32 | - [ ] Add performance regression tests
33 | - [ ] Implement integration test suite
34 | - [ ] Add stress testing for concurrent operations
35 | 
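A sketch of the "graceful degradation" item from section 2: try the cloud model first and fall back to the local one. Only the two describer functions documented elsewhere in this repo are assumed; the wrapper itself is hypothetical:

```python
from pyvisionai import describe_image_ollama, describe_image_openai


def describe_with_fallback(image_path: str, prompt: str | None = None) -> str:
    """Degrade from the cloud model to the local model on failure.

    Illustrative only; error types and ordering are assumptions.
    """
    try:
        return describe_image_openai(image_path, prompt=prompt)
    except Exception as cloud_error:  # e.g. rate limit, timeout, missing key
        try:
            return describe_image_ollama(image_path, prompt=prompt)
        except Exception as local_error:
            raise RuntimeError(
                f"all models failed: cloud={cloud_error!r}, local={local_error!r}"
            ) from local_error
```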
36 | ### 5. Documentation Updates (Low Priority)
37 | - [x] Update API documentation with model factory pattern
38 | - [ ] Add examples for custom model implementation
39 | - [ ] Create troubleshooting guide
40 | - [ ] Document performance optimization strategies
41 | - [ ] Add architecture decision records
42 | - [ ] Add retry mechanism configuration examples
43 | 
44 | ### Dependencies and Notes
45 | - Model support improvements depend on user demand and API availability
46 | - Performance optimizations should be driven by real-world usage patterns
47 | - Documentation updates should follow major feature implementations
48 | 
49 | ### Added
50 | - Implemented Model Factory pattern for vision models (sketched below):
51 |   - Added VisionModel base class with abstract methods
52 |   - Added ModelFactory for centralized model management
53 |   - Added concrete implementations for GPT4 and Llama models
54 |   - Added comprehensive logging for model lifecycle
55 |   - Added configuration validation for each model type
56 | - Added retry mechanism for handling transient failures:
57 |   - Implemented RetryManager with configurable strategies
58 |   - Added support for exponential, linear, and constant backoff
59 |   - Added comprehensive logging for retry attempts
60 |   - Added proper error handling and delay management
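A condensed sketch of the factory pattern these notes describe. `VisionModel` and `ModelFactory` are named in the changelog above; the registration and creation signatures here are assumptions (the real code lives in `pyvisionai/core/factory.py`):

```python
from abc import ABC, abstractmethod


class VisionModel(ABC):
    """Abstract base mirroring the 'VisionModel base class' bullet."""

    @abstractmethod
    def describe_image(self, image_path: str, prompt: str) -> str: ...


class ModelFactory:
    """Centralized registry and creation point for vision models."""

    _registry: dict[str, type[VisionModel]] = {}

    @classmethod
    def register(cls, name: str):
        def decorator(model_cls: type[VisionModel]) -> type[VisionModel]:
            cls._registry[name] = model_cls
            return model_cls
        return decorator

    @classmethod
    def create(cls, name: str, **config) -> VisionModel:
        try:
            return cls._registry[name](**config)
        except KeyError:
            raise ValueError(f"unknown model: {name!r}") from None


@ModelFactory.register("gpt4")
class GPT4Vision(VisionModel):
    def __init__(self, api_key: str | None = None):
        self.api_key = api_key  # validated in the real implementation

    def describe_image(self, image_path: str, prompt: str) -> str:
        raise NotImplementedError("OpenAI call goes here")
```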
61 | 
62 | ### Completed
63 | ### Retry Mechanism (2024-02-08)
64 | - [x] Basic retry manager implementation with configurable strategies
65 | - [x] Custom error hierarchy for different failure types
66 | - [x] Integration with Ollama and OpenAI API calls
67 | - [x] Comprehensive test suite for retry scenarios
68 | - [x] Logging for retry attempts and failures
69 | 
70 | ## Pending Improvements
71 | 
72 | ### Test Suite Optimization (High Priority)
73 | - [ ] Reduce test execution time (currently 3m36s)
74 |   - [ ] Implement global time.sleep mocking
75 |   - [ ] Add test categorization (@pytest.mark.slow/@pytest.mark.fast)
76 |   - [ ] Create shared mock API response fixtures
77 |   - [ ] Mock file operations in extraction tests
78 |   - [ ] Add parallel test execution where possible
79 | - [ ] Improve test organization
80 |   - [ ] Group tests by execution speed
81 |   - [ ] Create common fixtures for API responses
82 |   - [ ] Standardize mock data across test suite
83 | - [ ] Maintain test coverage while improving speed
84 |   - [ ] Add integration test suite for critical paths
85 |   - [ ] Keep selected end-to-end tests unmocked
86 |   - [ ] Add test execution time monitoring
87 | 
88 | ### Retry Mechanism Enhancements (Medium Priority)
89 | - [ ] Add jitter to retry delays to prevent thundering herd
90 | - [ ] Support Retry-After header for rate limits
91 | - [ ] Implement circuit breaker pattern for persistent failures
92 | - [ ] Add retry budget/quota management
93 | - [ ] Add tests for different retry strategies (LINEAR, CONSTANT)
94 | - [ ] Add edge case tests for delay calculations
95 | - [ ] Add timeout handling tests
96 | - [ ] Add invalid configuration tests
97 | 
98 | ### Testing Improvements (Medium Priority)
99 | - [ ] Add performance regression tests
100 | - [ ] Enhance integration testing
101 | - [ ] Add stress testing for retry mechanism
102 | - [ ] Add concurrent operation tests
103 | 
104 | ### Documentation Updates (Low Priority)
105 | - [ ] Add retry mechanism usage examples
106 | - [ ] Document retry configuration options
107 | - [ ] Add troubleshooting guide for API errors
108 | - [ ] Update API documentation with retry behavior
109 | 
110 | ### TODO
111 | - [ ] Integrate retry mechanism with API calls:
112 |   - [ ] Add retry for Ollama API calls (connection errors, rate limits, server errors)
113 |   - [ ] Add retry for OpenAI API calls (rate limits, server errors, timeouts)
114 |   - [ ] Add tests to verify retry behavior with mocked API responses
115 |   - [ ] Update documentation with retry configuration examples
116 | 
--------------------------------------------------------------------------------
/commit_message.txt:
--------------------------------------------------------------------------------
1 | feat(api): Add retry mechanism for API calls
2 | 
3 | Add robust retry mechanism to handle transient API failures:
4 | - Add RetryManager with configurable retry strategies (exponential, linear, constant)
5 | - Add custom error hierarchy (RetryableError, APIError, RateLimitError, etc.)
6 | - Integrate retry with Ollama and OpenAI API calls
7 | - Add comprehensive test suite with mocked API responses
8 | - Add logging for retry attempts and failures
9 | 
10 | The retry mechanism handles:
11 | - Connection errors (network issues)
12 | - Rate limiting (429 responses)
13 | - Server errors (5xx responses)
14 | - API-specific errors (OpenAI/Ollama)
15 | 
16 | Tests:
17 | - Add test_api_retry.py with mocked API responses
18 | - Add test coverage for all retry scenarios
19 | - Mock time.sleep for faster test execution
20 | 
21 | This change improves reliability of API calls without breaking existing functionality.
22 | No migration needed for existing code.
23 | 
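A compact sketch of the retry behavior this commit message describes, including the delay jitter the TODO list still calls for. The strategy names follow the changelog; the decorator form and signatures are assumptions (the real implementation is `pyvisionai/utils/retry.py`):

```python
import functools
import random
import time
from enum import Enum


class RetryStrategy(Enum):
    EXPONENTIAL = "exponential"
    LINEAR = "linear"
    CONSTANT = "constant"


class RetryableError(Exception):
    """Base for errors worth retrying: rate limits, 5xx, timeouts."""


def retry(max_attempts: int = 3, base_delay: float = 1.0,
          strategy: RetryStrategy = RetryStrategy.EXPONENTIAL):
    """Retry RetryableError with configurable backoff plus full jitter."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(1, max_attempts + 1):
                try:
                    return func(*args, **kwargs)
                except RetryableError:
                    if attempt == max_attempts:
                        raise  # budget exhausted, surface the error
                    if strategy is RetryStrategy.EXPONENTIAL:
                        delay = base_delay * (2 ** (attempt - 1))
                    elif strategy is RetryStrategy.LINEAR:
                        delay = base_delay * attempt
                    else:
                        delay = base_delay
                    # Full jitter avoids thundering-herd retries.
                    time.sleep(random.uniform(0, delay))
        return wrapper
    return decorator
```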
--------------------------------------------------------------------------------
/content/extracted/.include:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MDGrey33/pyvisionai/da23f86e12e6576f5f65bf39f32f424896d853b3/content/extracted/.include
--------------------------------------------------------------------------------
/content/log/.include:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MDGrey33/pyvisionai/da23f86e12e6576f5f65bf39f32f424896d853b3/content/log/.include
--------------------------------------------------------------------------------
/content/source/.include:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MDGrey33/pyvisionai/da23f86e12e6576f5f65bf39f32f424896d853b3/content/source/.include
--------------------------------------------------------------------------------
/content/test/source/test.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MDGrey33/pyvisionai/da23f86e12e6576f5f65bf39f32f424896d853b3/content/test/source/test.docx
--------------------------------------------------------------------------------
/content/test/source/test.html:
--------------------------------------------------------------------------------
[HTML test fixture, 159 lines; its markup, CSS, and scripts were stripped during extraction and are not recoverable. Recoverable text content: title "Exploring Nature" with subtitle "An Interactive Journey Through Natural Habitats"; an Introduction on nature as an endless canvas of beauty, diversity, and inspiration; Key Points: 1. Biodiversity, with a "Global Biodiversity Numbers" widget whose data updates every few seconds to simulate live tracking, and 2. Human Connection, on the psychological and physical benefits of activities like hiking and birdwatching; a click-to-zoom forest image captioned "A lush forest scene showcasing natural biodiversity"; and "Case Study: Amazon Rainforest", noting that the Amazon, often called the "lungs of the Earth," produces 20% of the world's oxygen and is home to countless unique species.]
--------------------------------------------------------------------------------
/content/test/source/test.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MDGrey33/pyvisionai/da23f86e12e6576f5f65bf39f32f424896d853b3/content/test/source/test.jpeg
--------------------------------------------------------------------------------
/content/test/source/test.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MDGrey33/pyvisionai/da23f86e12e6576f5f65bf39f32f424896d853b3/content/test/source/test.pdf
--------------------------------------------------------------------------------
/content/test/source/test.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MDGrey33/pyvisionai/da23f86e12e6576f5f65bf39f32f424896d853b3/content/test/source/test.pptx
--------------------------------------------------------------------------------
/docs/api/file_extractor.md:
--------------------------------------------------------------------------------
1 | ```mermaid
2 | graph TD
3 |     classDef function fill:#4169E1,stroke:#000080,stroke-width:2px,color:white
4 |     classDef required fill:#ff6666,stroke:#ff0000,stroke-width:2px,color:black
5 |     classDef optional fill:#98FB98,stroke:#00ff00,stroke-width:2px,color:black
6 |     classDef default fill:#87CEEB,stroke:#4682B4,stroke-width:2px,color:black
7 |     classDef example fill:#FFE4B5,stroke:#FFD700,stroke-width:2px,color:black
8 | 
9 |     subgraph IMPORTS["📦 IMPORTS"]
10 |         Import["from pyvisionai import create_extractor"]
11 |     end
12 | 
13 |     subgraph FUNCTIONS["🔵 FUNCTIONS"]
14 |         Create["create_extractor()"]
15 |         Extract["extractor.extract()"]
16 |     end
17 | 
18 |     subgraph EXAMPLES["✨ EXAMPLES"]
19 |         CreateExample["extractor = create_extractor('pdf', extractor_type='text_and_images', model='gpt4')"]
20 |         ExtractExample["output_path = extractor.extract('document.pdf', 'output_dir')"]
21 |     end
22 | 
23 |     subgraph CREATE_PARAMS["📝 create_extractor Parameters"]
24 |         CreateRequired["🔴 Required:
25 |         file_type: str (pdf|docx|pptx|html)"]
26 | 
27 |         CreateOptional["🟢 Optional:
28 |         extractor_type: str = 'page_as_image'
29 |         model: str = 'gpt4'
30 |         api_key: str = None (from env)
31 |         prompt: str = DEFAULT_PROMPT"]
32 |     end
33 | 
34 |     subgraph EXTRACT_PARAMS["📝 extract Parameters"]
35 |         ExtractRequired["🔴 Required:
36 |         file_path: str
37 |         output_dir: str"]
38 | 
39 |         ExtractReturn["Returns: str
40 |         Path to generated markdown file"]
41 |     end
42 | 
43 |     Import --> Create
44 |     Create --> CreateRequired & CreateOptional
45 |     Create --> Extract
46 |     Extract --> ExtractRequired --> ExtractReturn
47 |     CreateRequired & CreateOptional --> CreateExample
48 |     ExtractRequired --> ExtractExample
49 | 
50 |     class Create,Extract function
51 |     class CreateRequired,ExtractRequired required
52 |     class CreateOptional optional
53 |     class CreateExample,ExtractExample example
54 | ```
55 | 
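A runnable sketch of the two calls the diagram above documents (file and directory names are illustrative):

```python
import os

from pyvisionai import create_extractor

# The optional arguments are spelled out here to mirror the diagram;
# page_as_image and gpt4 are the documented defaults.
extractor = create_extractor(
    "pdf", extractor_type="text_and_images", model="gpt4"
)

os.makedirs("output_dir", exist_ok=True)
output_path = extractor.extract("document.pdf", "output_dir")
print(f"Markdown written to: {output_path}")
```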
--------------------------------------------------------------------------------
/docs/api/image_description.md:
--------------------------------------------------------------------------------
1 | ```mermaid
2 | graph TD
3 |     classDef function fill:#4169E1,stroke:#000080,stroke-width:2px,color:white
4 |     classDef required fill:#ff6666,stroke:#ff0000,stroke-width:2px,color:black
5 |     classDef optional fill:#98FB98,stroke:#00ff00,stroke-width:2px,color:black
6 |     classDef default fill:#87CEEB,stroke:#4682B4,stroke-width:2px,color:black
7 |     classDef example fill:#FFE4B5,stroke:#FFD700,stroke-width:2px,color:black
8 | 
9 |     subgraph IMPORTS["📦 IMPORTS"]
10 |         Import["from pyvisionai import
11 |         describe_image_openai,
12 |         describe_image_ollama"]
13 |     end
14 | 
15 |     subgraph FUNCTIONS["🔵 FUNCTIONS"]
16 |         OpenAI["describe_image_openai()"]
17 |         Ollama["describe_image_ollama()"]
18 |     end
19 | 
20 |     subgraph EXAMPLES["✨ EXAMPLES"]
21 |         OpenAIExample["description = describe_image_openai('image.jpg', model='gpt4', api_key='key', prompt='custom prompt')"]
22 | 
23 |         OllamaExample["description = describe_image_ollama('image.jpg', model='llama3.2-vision', prompt='custom prompt')"]
24 |     end
25 | 
26 |     subgraph OPENAI_PARAMS["📝 OpenAI Parameters"]
27 |         OpenAIRequired["🔴 Required:
28 |         image_path: str"]
29 | 
30 |         OpenAIOptional["🟢 Optional:
31 |         model: str = 'gpt-4-vision-preview'
32 |         api_key: str = None (from env)
33 |         prompt: str = DEFAULT_PROMPT
34 |         max_tokens: int = 300"]
35 |     end
36 | 
37 |     subgraph OLLAMA_PARAMS["📝 Ollama Parameters"]
38 |         OllamaRequired["🔴 Required:
39 |         image_path: str"]
40 | 
41 |         OllamaOptional["🟢 Optional:
42 |         model: str = 'llama3.2-vision'
43 |         prompt: str = DEFAULT_PROMPT"]
44 |     end
45 | 
46 |     Import --> OpenAI & Ollama
47 |     OpenAI --> OpenAIRequired & OpenAIOptional --> OpenAIExample
48 |     Ollama --> OllamaRequired & OllamaOptional --> OllamaExample
49 | 
50 |     class OpenAI,Ollama function
51 |     class OpenAIRequired,OllamaRequired required
52 |     class OpenAIOptional,OllamaOptional optional
53 |     class OpenAIExample,OllamaExample example
54 | ```
--------------------------------------------------------------------------------
/docs/cli/file_extractor.md:
--------------------------------------------------------------------------------
1 | 
2 | ```mermaid
3 | graph TD
4 |     classDef required fill:#ff6666,stroke:#ff0000,stroke-width:2px,color:black
5 |     classDef optional fill:#98FB98,stroke:#00ff00,stroke-width:2px,color:black
6 |     classDef default fill:#87CEEB,stroke:#4682B4,stroke-width:2px,color:black
7 |     classDef example fill:#FFE4B5,stroke:#FFD700,stroke-width:2px,color:black
8 | 
9 |     CLI(["file-extract"])
10 | 
11 |     subgraph EXAMPLES["✨ EXAMPLES"]
12 |         Basic["Quickstart:
13 |         file-extract -t pdf -s document.pdf -o ./output"]
14 | 
15 |         Directory["Directory:
16 |         file-extract -t pdf -s ./docs -o ./output"]
17 | 
18 |         Advanced["Advanced:
19 |         file-extract -t pdf -s document.pdf -o ./output -e text_and_images -m llama"]
20 |     end
21 | 
22 |     subgraph OPTIONAL["🟢 OPTIONAL"]
23 |         Extractor["--extractor, -e
24 |         📄 Extraction Method
25 |         text_and_images | page_as_image"]
26 | 
27 |         Model["--model, -m
28 |         🤖 Model Choice
29 |         llama | gpt4"]
30 | 
31 |         Key["--api-key, -k
32 |         🔑 OpenAI Key"]
33 | 
34 |         Prompt["--prompt, -p
35 |         💭 Custom Instructions"]
36 |     end
37 | 
38 |     subgraph REQUIRED["🔴 REQUIRED"]
39 |         Type["--type, -t
40 |         📄 File Type
41 |         pdf | docx | pptx | html"]
42 | 
43 |         Source["--source, -s
44 |         📥 Source Path
45 |         (file or directory)"]
46 | 
47 |         Output["--output, -o
48 |         📤 Output Directory"]
49 |     end
50 | 
51 |     subgraph DEFAULTS["🔵 DEFAULTS"]
52 |         ExtractorDefault["📄 page_as_image"]
53 |         ModelDefault["🤖 gpt4"]
54 |         KeyDefault["🔑 From ENV (OPENAI_API_KEY)"]
55 |         PromptDefault["💭 Describe this image in detail.
56 |         Preserve as much of the precise original
57 |         text, format, images and style as possible."]
58 |         SourceDefault["📥 content/source"]
59 |         OutputDefault["📤 content/extracted"]
60 |     end
61 | 
62 |     CLI --> Type & Source & Output
63 |     CLI --> Extractor & Model & Key & Prompt
64 | 
65 |     Extractor --> ExtractorDefault
66 |     Model --> ModelDefault
67 |     Key --> KeyDefault
68 |     Prompt --> PromptDefault
69 |     Source --> SourceDefault
70 |     Output --> OutputDefault
71 | 
72 |     class Type,Source,Output required
73 |     class Extractor,Model,Key,Prompt optional
74 |     class ExtractorDefault,ModelDefault,KeyDefault,PromptDefault,SourceDefault,OutputDefault default
75 |     class Basic,Directory,Advanced example
76 | ```
--------------------------------------------------------------------------------
/docs/cli/image_description.md:
--------------------------------------------------------------------------------
1 | 
2 | ```mermaid
3 | graph TD
4 |     classDef required fill:#ff6666,stroke:#ff0000,stroke-width:2px,color:black
5 |     classDef optional fill:#98FB98,stroke:#00ff00,stroke-width:2px,color:black
6 |     classDef default fill:#87CEEB,stroke:#4682B4,stroke-width:2px,color:black
7 |     classDef example fill:#FFE4B5,stroke:#FFD700,stroke-width:2px,color:black
8 | 
9 |     CLI(["describe-image"])
10 | 
11 |     subgraph EXAMPLES["✨ EXAMPLES"]
12 |         Basic["Quickstart:
13 |         describe-image -i photo.jpg"]
14 | 
15 |         Local["Local Model:
16 |         describe-image -i photo.jpg -u llama"]
17 |     end
18 | 
19 |     subgraph OPTIONAL["🟢 OPTIONAL"]
20 |         Model["--use-case, -u
21 |         🤖 Model Choice
22 |         gpt4 | llama"]
23 | 
24 |         Key["--api-key, -k
25 |         🔑 OpenAI Key"]
26 | 
27 |         Verbose["--verbose, -v
28 |         📝 Detailed Output"]
29 | 
30 |         Prompt["--prompt, -p
31 |         💭 Custom Instructions"]
32 |     end
33 | 
34 |     subgraph REQUIRED["🔴 REQUIRED"]
35 |         Image["--image, -i
36 |         📸 Image File Path"]
37 |     end
38 | 
39 |     subgraph DEFAULTS["🔵 DEFAULTS"]
40 |         ModelDefault["🤖 gpt4"]
41 |         KeyDefault["🔑 From ENV (OPENAI_API_KEY)"]
42 |         VerboseDefault["📝 Off"]
43 |         PromptDefault["💭 Describe this image in detail"]
44 |     end
45 | 
46 |     CLI --> Image
47 |     CLI --> Model & Key & Verbose & Prompt
48 | 
49 |     Model --> ModelDefault
50 |     Key --> KeyDefault
51 |     Verbose --> VerboseDefault
52 |     Prompt --> PromptDefault
53 | 
54 |     class Image required
55 |     class Model,Key,Verbose,Prompt optional
56 |     class ModelDefault,KeyDefault,VerboseDefault,PromptDefault default
57 |     class Basic,Local example
58 | ```
--------------------------------------------------------------------------------
/docs/getting_started.md:
--------------------------------------------------------------------------------
1 | # Getting Started with PyVisionAI
2 | 
3 | This guide will help you get started with PyVisionAI, a powerful tool for extracting and describing content from documents using Vision Language Models.
4 | 
5 | ## Prerequisites
6 | 
7 | Before using PyVisionAI, ensure you have:
8 | 
9 | 1. Python 3.11 or higher installed
10 | 2. System dependencies installed:
11 |    ```bash
12 |    # macOS
13 |    brew install --cask libreoffice  # For DOCX/PPTX
14 |    brew install poppler             # For PDF
15 | 
16 |    # Ubuntu/Debian
17 |    sudo apt-get install -y libreoffice poppler-utils
18 | 
19 |    # Windows
20 |    # Install LibreOffice and Poppler manually
21 |    ```
22 | 
23 | ## Installation
24 | 
25 | 1. **Install via pip**
26 |    ```bash
27 |    pip install pyvisionai
28 |    ```
29 | 
30 | 2. Set up environment variables:
31 |    ```bash
32 |    # For OpenAI Vision (recommended)
33 |    export OPENAI_API_KEY='your-api-key'
34 |    ```
35 | ## Configuration
36 | 
37 | ### API Keys
38 | 
39 | 1. 
**Cloud-based Models (Recommended)** 40 | 41 | Choose one or both of the following: 42 | 43 | ```bash 44 | # For GPT-4 Vision 45 | export OPENAI_API_KEY='your-openai-key' 46 | 47 | # For Claude Vision 48 | export ANTHROPIC_API_KEY='your-anthropic-key' 49 | ``` 50 | 51 | 2. **Local Model (Optional)** 52 | ```bash 53 | # First install and start Ollama 54 | brew install ollama # macOS 55 | # Or for Linux 56 | curl -fsSL https://ollama.com/install.sh | sh 57 | # Or for Windows, download from: 58 | # https://ollama.com/download/windows 59 | 60 | ollama serve 61 | ollama pull llama3.2-vision 62 | ``` 63 | 64 | ## Basic Usage 65 | 66 | ### Command Line Interface 67 | 68 | 1. **Extract Content from Documents** 69 | ```bash 70 | # Basic usage (uses GPT-4 Vision by default) 71 | file-extract -t pdf -s document.pdf -o output_dir 72 | 73 | # Using Claude Vision 74 | file-extract -t pdf -s document.pdf -o output_dir -m claude 75 | 76 | # Using local Llama model 77 | file-extract -t pdf -s document.pdf -o output_dir -m llama 78 | ``` 79 | 80 | 2. **Describe Images** 81 | ```bash 82 | # Using GPT-4 Vision (default) 83 | describe-image -i image.jpg 84 | 85 | # Using Claude Vision 86 | describe-image -i image.jpg -u claude -k your-anthropic-key 87 | 88 | # Using local Llama model 89 | describe-image -i image.jpg -u llama 90 | 91 | # With custom prompt 92 | describe-image -i image.jpg -p "Describe the main colors and objects" 93 | ``` 94 | 95 | ### Python Library 96 | 97 | ```python 98 | from pyvisionai import ( 99 | create_extractor, 100 | describe_image_openai, 101 | describe_image_claude, 102 | describe_image_ollama 103 | ) 104 | 105 | # 1. Extract content from documents 106 | # Using GPT-4 Vision (default) 107 | extractor = create_extractor("pdf") 108 | output_path = extractor.extract("document.pdf", "output_dir") 109 | 110 | # Using Claude Vision 111 | extractor = create_extractor("pdf", model="claude") 112 | output_path = extractor.extract("document.pdf", "output_dir") 113 | 114 | # Using local Llama model 115 | extractor = create_extractor("pdf", model="llama") 116 | output_path = extractor.extract("document.pdf", "output_dir") 117 | 118 | # 2. Describe images 119 | # Using GPT-4 Vision 120 | description = describe_image_openai( 121 | "image.jpg", 122 | api_key="your-openai-key", # optional if set in environment 123 | prompt="Describe the main objects" # optional 124 | ) 125 | 126 | # Using Claude Vision 127 | description = describe_image_claude( 128 | "image.jpg", 129 | api_key="your-anthropic-key", # optional if set in environment 130 | prompt="List the colors present" # optional 131 | ) 132 | 133 | # Using local Llama model 134 | description = describe_image_ollama( 135 | "image.jpg", 136 | prompt="Describe the scene" # optional 137 | ) 138 | ``` 139 | 140 | ## Supported File Types 141 | 142 | - PDF (`.pdf`) 143 | - Word Documents (`.docx`) 144 | - PowerPoint Presentations (`.pptx`) 145 | - HTML Pages (`.html`, `.htm`) 146 | 147 | ## Vision Models 148 | 149 | 1. **GPT-4 Vision (Default)** 150 | - Cloud-based model by OpenAI 151 | - Requires API key 152 | - Best for general-purpose image description 153 | - Supports detailed custom prompts 154 | 155 | 2. **Claude Vision** 156 | - Cloud-based model by Anthropic 157 | - Requires API key 158 | - Excellent for detailed analysis 159 | - Strong at identifying text in images 160 | 161 | 3. 
**Llama Vision** 162 | - Local model via Ollama 163 | - No API key required 164 | - Good for basic image description 165 | - Runs entirely on your machine 166 | 167 | ## Extraction Methods 168 | 169 | 1. **page_as_image (Default)** 170 | - Converts each page to an image 171 | - Sends to vision model for description 172 | - Best for maintaining layout 173 | - Works with all file types 174 | 175 | 2. **text_and_images** 176 | - Extracts text and images separately 177 | - More efficient for text-heavy documents 178 | - Better for searchable output 179 | - Not available for HTML files 180 | 181 | ## Output Format 182 | 183 | The extracted content is saved in markdown format: 184 | 185 | ```markdown 186 | # Document Title 187 | 188 | ## Page 1 189 | [Description of page content] 190 | 191 | ### Extracted Text 192 | [Text content if available] 193 | 194 | ### Images 195 | 1. [Description of image 1] 196 | 2. [Description of image 2] 197 | 198 | ## Page 2 199 | ... 200 | ``` 201 | 202 | ## Advanced Usage 203 | 204 | ### Custom Prompts 205 | 206 | ```bash 207 | # CLI 208 | describe-image -i image.jpg -p "List all visible text in the image" 209 | 210 | # Python 211 | description = describe_image_claude( 212 | "image.jpg", 213 | prompt="Identify and transcribe any visible text" 214 | ) 215 | ``` 216 | 217 | ### Batch Processing 218 | 219 | ```bash 220 | # Process all PDFs in a directory 221 | file-extract -t pdf -s input_dir -o output_dir 222 | 223 | # Process with specific model 224 | file-extract -t pdf -s input_dir -o output_dir -m claude 225 | ``` 226 | 227 | ### Error Handling 228 | 229 | ```python 230 | try: 231 | description = describe_image_claude("image.jpg") 232 | except ValueError as e: 233 | print(f"Configuration error: {e}") 234 | except Exception as e: 235 | print(f"Error processing image: {e}") 236 | ``` 237 | 238 | ## Next Steps 239 | 240 | 1. Read the [API Documentation](api.md) for detailed reference 241 | 2. Check out the [Examples](../examples/) directory 242 | 3. Learn about [Testing](testing.md) your implementations 243 | 4. Review [Contributing Guidelines](../CONTRIBUTING.md) 244 | 245 | ## Common Issues 246 | 247 | 1. **Missing Dependencies** 248 | ```bash 249 | # If you see LibreOffice errors: 250 | brew install --cask libreoffice # macOS 251 | 252 | # If you see Poppler errors: 253 | brew install poppler # macOS 254 | ``` 255 | 256 | 2. **Memory Issues with Large Files** 257 | - Use `text_and_images` method instead of `page_as_image` 258 | - Process files in smaller batches 259 | - Increase system swap space if needed 260 | 261 | 3. **Slow Processing** 262 | - Consider using cloud-based GPT-4 Vision for faster results 263 | - Process files in parallel for batch operations 264 | - Use SSD storage for better I/O performance 265 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # PyVisionAI Examples 2 | 3 | This directory contains practical examples demonstrating various use cases of PyVisionAI. Each example is self-contained and includes detailed comments explaining the code. 4 | 5 | ## Examples List 6 | 7 | 1. `basic_extraction.py` - Simple examples of extracting content from different file types 8 | 2. `batch_processing.py` - Process multiple files in parallel with progress tracking 9 | 3. `custom_prompts.py` - Examples of using custom prompts for different document types 10 | 4. 
`web_scraping.py` - Extract content from web pages and dynamic websites 11 | 5. `privacy_focused.py` - Using local Llama model for private document processing 12 | 6. `output_customization.py` - Customize the output format and organization 13 | 7. `error_handling.py` - Proper error handling and logging examples 14 | 8. `memory_optimization.py` - Techniques for processing large documents efficiently 15 | 16 | ## Running the Examples 17 | 18 | 1. Install dependencies: 19 | ```bash 20 | pip install pyvisionai 21 | ``` 22 | 23 | 2. Set up your environment variables: 24 | ```bash 25 | export OPENAI_API_KEY='your-api-key' # For OpenAI examples 26 | ``` 27 | 28 | 3. Run any example: 29 | ```bash 30 | python examples/basic_extraction.py 31 | ``` 32 | 33 | ## Example Data 34 | 35 | The `example_data/` directory contains sample files for testing: 36 | - PDF documents 37 | - Word documents 38 | - PowerPoint presentations 39 | - HTML files 40 | - Images 41 | 42 | ## Contributing 43 | 44 | Feel free to add your own examples! Please follow these guidelines: 45 | 1. Include detailed comments 46 | 2. Handle errors appropriately 47 | 3. Follow the existing naming convention 48 | 4. Update this README with your example 49 | -------------------------------------------------------------------------------- /examples/basic_extraction.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Basic extraction examples using PyVisionAI. 4 | 5 | This script demonstrates the most common use cases for extracting content 6 | from different types of files using PyVisionAI. 7 | """ 8 | 9 | import os 10 | 11 | from pyvisionai import create_extractor, describe_image_openai 12 | 13 | 14 | def ensure_dir(directory): 15 | """Create directory if it doesn't exist.""" 16 | if not os.path.exists(directory): 17 | os.makedirs(directory) 18 | 19 | 20 | def example_pdf_extraction(): 21 | """Example: Extract content from a PDF file.""" 22 | print("\n=== PDF Extraction Example ===") 23 | 24 | # Create PDF extractor with default settings (page_as_image + GPT-4 Vision) 25 | extractor = create_extractor("pdf") 26 | 27 | try: 28 | # Extract content from PDF 29 | input_file = "example_data/sample.pdf" 30 | output_path = extractor.extract( 31 | input_file, os.path.join("output", "pdf") 32 | ) 33 | print(f"PDF content extracted to: {output_path}") 34 | except FileNotFoundError: 35 | print( 36 | "Error processing technical doc: File not found - example_data/sample.pdf" 37 | ) 38 | except Exception as e: 39 | print( 40 | f"Error processing technical doc: {type(e).__name__}: {str(e)}" 41 | ) 42 | 43 | 44 | def example_docx_extraction(): 45 | """Example: Extract content from a Word document.""" 46 | print("\n=== Word Document Extraction Example ===") 47 | 48 | # Create DOCX extractor using text_and_images method 49 | extractor = create_extractor( 50 | "docx", extractor_type="text_and_images" 51 | ) 52 | 53 | try: 54 | # Extract content from DOCX 55 | input_file = "example_data/sample.docx" 56 | output_path = extractor.extract( 57 | input_file, os.path.join("output", "docx") 58 | ) 59 | print(f"DOCX content extracted to: {output_path}") 60 | except FileNotFoundError: 61 | print( 62 | "Error processing technical doc: File not found - example_data/sample.docx" 63 | ) 64 | except Exception as e: 65 | print( 66 | f"Error processing technical doc: {type(e).__name__}: {str(e)}" 67 | ) 68 | 69 | 70 | def example_pptx_extraction(): 71 | """Example: Extract content from a PowerPoint 
presentation.""" 72 | print("\n=== PowerPoint Extraction Example ===") 73 | 74 | # Create PPTX extractor with custom prompt 75 | extractor = create_extractor( 76 | "pptx", 77 | prompt="List all text content and describe any diagrams or charts", 78 | ) 79 | 80 | try: 81 | # Extract content from PPTX 82 | input_file = "example_data/sample.pptx" 83 | output_path = extractor.extract( 84 | input_file, os.path.join("output", "pptx") 85 | ) 86 | print(f"PPTX content extracted to: {output_path}") 87 | except FileNotFoundError: 88 | print( 89 | "Error processing technical doc: File not found - example_data/sample.pptx" 90 | ) 91 | except Exception as e: 92 | print( 93 | f"Error processing technical doc: {type(e).__name__}: {str(e)}" 94 | ) 95 | 96 | 97 | def example_html_extraction(): 98 | """Example: Extract content from a web page.""" 99 | print("\n=== Web Page Extraction Example ===") 100 | 101 | # Create HTML extractor 102 | extractor = create_extractor("html") 103 | 104 | try: 105 | # Extract content from HTML 106 | output_path = extractor.extract( 107 | "https://example.com", os.path.join("output", "html") 108 | ) 109 | print(f"HTML content extracted to: {output_path}") 110 | except Exception as e: 111 | print(f"Error extracting HTML: {type(e).__name__}: {str(e)}") 112 | 113 | 114 | def example_image_description(): 115 | """Example: Describe individual images.""" 116 | print("\n=== Image Description Example ===") 117 | 118 | try: 119 | # Describe image using OpenAI Vision 120 | input_file = "example_data/sample_image.jpg" 121 | description = describe_image_openai( 122 | input_file, 123 | prompt="Describe the main elements and any text in this image", 124 | ) 125 | print("Image Description:") 126 | print(description) 127 | except FileNotFoundError: 128 | print( 129 | "Error analyzing chart: File not found - example_data/sample_image.jpg" 130 | ) 131 | except Exception as e: 132 | print(f"Error analyzing chart: {type(e).__name__}: {str(e)}") 133 | 134 | 135 | def main(): 136 | """Run all examples.""" 137 | # Create output directories 138 | ensure_dir(os.path.join("output", "pdf")) 139 | ensure_dir(os.path.join("output", "docx")) 140 | ensure_dir(os.path.join("output", "pptx")) 141 | ensure_dir(os.path.join("output", "html")) 142 | 143 | # Run examples 144 | example_pdf_extraction() 145 | example_docx_extraction() 146 | example_pptx_extraction() 147 | example_html_extraction() 148 | example_image_description() 149 | 150 | print("\nAll examples completed!") 151 | 152 | 153 | if __name__ == "__main__": 154 | main() 155 | -------------------------------------------------------------------------------- /examples/batch_processing.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Batch processing example using PyVisionAI. 4 | 5 | This script demonstrates how to efficiently process multiple files 6 | in parallel with progress tracking and error handling. 7 | """ 8 | 9 | import os 10 | import time 11 | from concurrent.futures import ThreadPoolExecutor, as_completed 12 | from typing import Dict, List, Tuple 13 | 14 | from pyvisionai import create_extractor 15 | 16 | 17 | class BatchProcessor: 18 | """Handles batch processing of documents with progress tracking.""" 19 | 20 | def __init__(self, max_workers: int = 4): 21 | """ 22 | Initialize the batch processor. 
23 | 24 | Args: 25 | max_workers: Maximum number of parallel workers 26 | """ 27 | self.max_workers = max_workers 28 | self.extractors: Dict[str, object] = { 29 | ".pdf": create_extractor("pdf"), 30 | ".docx": create_extractor("docx"), 31 | ".pptx": create_extractor("pptx"), 32 | ".html": create_extractor("html"), 33 | } 34 | 35 | def process_file( 36 | self, input_path: str, output_dir: str 37 | ) -> Tuple[str, bool, str]: 38 | """ 39 | Process a single file. 40 | 41 | Args: 42 | input_path: Path to input file 43 | output_dir: Output directory 44 | 45 | Returns: 46 | Tuple of (filename, success status, message) 47 | """ 48 | filename = os.path.basename(input_path) 49 | ext = os.path.splitext(filename)[1].lower() 50 | 51 | if ext not in self.extractors: 52 | return filename, False, "Unsupported file type" 53 | 54 | try: 55 | # Create file-specific output directory 56 | file_output_dir = os.path.join( 57 | output_dir, filename.replace(".", "_") 58 | ) 59 | os.makedirs(file_output_dir, exist_ok=True) 60 | 61 | # Extract content 62 | output_path = self.extractors[ext].extract( 63 | input_path, file_output_dir 64 | ) 65 | return ( 66 | filename, 67 | True, 68 | f"Processed successfully: {output_path}", 69 | ) 70 | 71 | except Exception as e: 72 | return filename, False, f"Error: {str(e)}" 73 | 74 | def process_directory( 75 | self, 76 | input_dir: str, 77 | output_dir: str, 78 | file_types: List[str] = None, 79 | ) -> Tuple[int, int, List[str]]: 80 | """ 81 | Process all supported files in a directory. 82 | 83 | Args: 84 | input_dir: Input directory path 85 | output_dir: Output directory path 86 | file_types: List of file extensions to process (default: all supported) 87 | 88 | Returns: 89 | Tuple of (successful count, failed count, error messages) 90 | """ 91 | if file_types is None: 92 | file_types = list(self.extractors.keys()) 93 | 94 | # Get list of files to process 95 | files_to_process = [] 96 | for root, _, files in os.walk(input_dir): 97 | for file in files: 98 | if any( 99 | file.lower().endswith(ext) for ext in file_types 100 | ): 101 | files_to_process.append(os.path.join(root, file)) 102 | 103 | if not files_to_process: 104 | return 0, 0, ["No files found to process"] 105 | 106 | # Create output directory 107 | os.makedirs(output_dir, exist_ok=True) 108 | 109 | # Process files in parallel 110 | successful = 0 111 | failed = 0 112 | errors = [] 113 | 114 | print(f"\nProcessing {len(files_to_process)} files...") 115 | start_time = time.time() 116 | 117 | with ThreadPoolExecutor( 118 | max_workers=self.max_workers 119 | ) as executor: 120 | # Submit all tasks 121 | future_to_file = { 122 | executor.submit(self.process_file, f, output_dir): f 123 | for f in files_to_process 124 | } 125 | 126 | # Process completed tasks 127 | for i, future in enumerate(as_completed(future_to_file), 1): 128 | filename, success, message = future.result() 129 | if success: 130 | successful += 1 131 | else: 132 | failed += 1 133 | errors.append(f"{filename}: {message}") 134 | 135 | # Print progress 136 | print( 137 | f"Progress: {i}/{len(files_to_process)} files " 138 | f"({successful} successful, {failed} failed)" 139 | ) 140 | 141 | # Print summary 142 | elapsed_time = time.time() - start_time 143 | print(f"\nProcessing completed in {elapsed_time:.2f} seconds") 144 | print(f"Successful: {successful}") 145 | print(f"Failed: {failed}") 146 | 147 | return successful, failed, errors 148 | 149 | 150 | def main(): 151 | """Run the batch processing example.""" 152 | # Initialize batch processor 153 | processor 
= BatchProcessor(max_workers=4) 154 | 155 | # Process all supported files in example_data directory 156 | successful, failed, errors = processor.process_directory( 157 | input_dir="example_data", output_dir="output/batch_results" 158 | ) 159 | 160 | # Print errors if any 161 | if errors: 162 | print("\nErrors encountered:") 163 | for error in errors: 164 | print(f"- {error}") 165 | 166 | 167 | if __name__ == "__main__": 168 | main() 169 | -------------------------------------------------------------------------------- /examples/example_data/README.md: -------------------------------------------------------------------------------- 1 | # Example Data 2 | 3 | This directory contains sample files for testing PyVisionAI functionality. These files are designed to demonstrate various features and use cases. 4 | 5 | ## File Types 6 | 7 | 1. PDF Documents 8 | - `sample.pdf` - A simple document with text and images 9 | - `complex.pdf` - A document with tables, charts, and complex layouts 10 | 11 | 2. Word Documents 12 | - `sample.docx` - Basic text document with embedded images 13 | - `report.docx` - Business report with tables and charts 14 | 15 | 3. PowerPoint Presentations 16 | - `sample.pptx` - Simple presentation with text and images 17 | - `charts.pptx` - Presentation focused on data visualization 18 | 19 | 4. HTML Files 20 | - `static.html` - Static webpage with text and images 21 | - `dynamic.html` - Page with dynamic content 22 | 23 | 5. Images 24 | - `sample_image.jpg` - Test image for direct description 25 | - `chart.png` - Sample chart for testing chart description 26 | - `diagram.png` - Technical diagram 27 | 28 | ## Usage 29 | 30 | These files are referenced in the example scripts. To use them: 31 | 32 | 1. Make sure this directory is in the same location as the example scripts 33 | 2. Run the examples: 34 | ```bash 35 | python examples/basic_extraction.py 36 | python examples/batch_processing.py 37 | ``` 38 | 39 | ## Contributing 40 | 41 | Feel free to add more example files, but please: 42 | 1. Keep files small and focused 43 | 2. Include files that demonstrate specific features 44 | 3. Update this README when adding new files 45 | 4. 
Avoid copyrighted content 46 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "pyvisionai" 3 | version = "0.3.1" 4 | description = "A Python library for extracting and describing content from documents using Vision LLMs" 5 | authors = ["MDGrey33 "] 6 | license = "Apache-2.0" 7 | readme = "README.md" 8 | repository = "https://github.com/MDGrey33/pyvisionai" 9 | keywords = ["pdf", "docx", "pptx", "html", "vision", "llm", "extraction"] 10 | classifiers = [ 11 | "Development Status :: 4 - Beta", 12 | "Intended Audience :: Developers", 13 | "License :: OSI Approved :: Apache Software License", 14 | "Programming Language :: Python :: 3", 15 | "Programming Language :: Python :: 3.12", 16 | ] 17 | packages = [ 18 | { include = "pyvisionai" } 19 | ] 20 | 21 | [tool.poetry.dependencies] 22 | python = "^3.11" 23 | requests = "^2.32.3" 24 | ollama = "^0.4.2" 25 | pillow = "^11.0.0" 26 | python-docx = "^1.1.2" 27 | python-pptx = "^1.0.2" 28 | openai = "^1.58.1" 29 | pdf2image = "^1.17.0" 30 | pdfminer-six = "^20231228" 31 | pypdf = "^4.1.0" 32 | playwright = "^1.41.0" 33 | anthropic = "^0.45.2" 34 | 35 | [tool.poetry.group.dev.dependencies] 36 | pytest = "^7.4.4" 37 | black = "^24.10.0" 38 | isort = "^5.13.2" 39 | pydocstyle = "^6.3.0" 40 | flake8 = "^7.0.0" 41 | flake8-pyproject = "^1.2.3" 42 | pytest-cov = "^4.1.0" 43 | pytest-order = "^1.2.0" 44 | 45 | [tool.poetry.scripts] 46 | file-extract = "pyvisionai.cli.extract:main" 47 | describe-image = "pyvisionai.cli.describe_image:main" 48 | 49 | [build-system] 50 | requires = ["poetry-core"] 51 | build-backend = "poetry.core.masonry.api" 52 | 53 | [tool.black] 54 | line-length = 72 55 | target-version = ['py312'] 56 | include = '\.pyi?$' 57 | skip-string-normalization = true 58 | preview = true 59 | extend-exclude = ''' 60 | /( 61 | \.git 62 | | \.venv 63 | )/ 64 | ''' 65 | 66 | [tool.isort] 67 | profile = "black" 68 | line_length = 72 69 | multi_line_output = 3 70 | include_trailing_comma = true 71 | force_grid_wrap = 0 72 | use_parentheses = true 73 | ensure_newline_before_comments = true 74 | 75 | [tool.pydocstyle] 76 | convention = "google" 77 | add_select = ["D417"] 78 | add_ignore = ["D100", "D104"] 79 | 80 | [tool.flake8] 81 | select = ["E9", "F63", "F7", "F82", "E722"] 82 | extend-ignore = ["E501", "F401", "W503", "E226", "E128", "F403", "F405", "E402", "E731", "F541"] 83 | max-complexity = 20 84 | exclude = [".git", "venv"] 85 | 86 | [tool.coverage.run] 87 | source = ["pyvisionai"] 88 | omit = ["tests/*", "examples/*"] 89 | 90 | [tool.coverage.report] 91 | exclude_lines = [ 92 | "pragma: no cover", 93 | "def __repr__", 94 | "if self.debug:", 95 | "raise NotImplementedError", 96 | "if __name__ == .__main__.:", 97 | "pass", 98 | "raise ImportError", 99 | ] 100 | show_missing = true 101 | fail_under = 80 102 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | testpaths = tests 3 | python_files = test_*.py 4 | python_classes = Test* 5 | python_functions = test_* 6 | addopts = 7 | --tb=short 8 | --quiet 9 | --no-header 10 | --disable-warnings 11 | -ra 12 | --log-cli-level=ERROR 13 | --log-file=tests/test.log 14 | --log-file-level=DEBUG 15 | --verbosity=0 16 | --show-capture=no 17 | log_cli_format = %(levelname)-8s %(name)s: %(message)s 18 | 
log_file_format = %(asctime)s [%(levelname)8s] %(name)s: %(message)s (%(filename)s:%(lineno)s) 19 | log_file_date_format = %Y-%m-%d %H:%M:%S 20 | 21 | # Suppress warnings that clutter test output 22 | filterwarnings = 23 | ignore::DeprecationWarning 24 | ignore::UserWarning 25 | 26 | # Configure logging for specific packages 27 | log_cli_handler_level = ERROR 28 | logging_modules = 29 | httpcore=ERROR 30 | httpx=ERROR 31 | openai=ERROR 32 | anthropic=ERROR 33 | -------------------------------------------------------------------------------- /pyvisionai/__init__.py: -------------------------------------------------------------------------------- 1 | """PyVisionAI package.""" 2 | 3 | from typing import Optional 4 | 5 | from pyvisionai.core.factory import create_extractor 6 | from pyvisionai.describers.base import describe_image 7 | from pyvisionai.describers.claude import ClaudeVisionModel 8 | from pyvisionai.describers.ollama import describe_image_ollama 9 | from pyvisionai.describers.openai import describe_image_openai 10 | 11 | 12 | def describe_image_claude( 13 | image_path: str, 14 | api_key: Optional[str] = None, 15 | prompt: Optional[str] = None, 16 | **kwargs, 17 | ) -> str: 18 | """Describe an image using Claude Vision. 19 | 20 | Args: 21 | image_path: Path to the image file 22 | api_key: Anthropic API key (optional) 23 | prompt: Custom prompt for image description (optional) 24 | **kwargs: Additional arguments passed to the model 25 | 26 | Returns: 27 | str: Image description 28 | """ 29 | model = ClaudeVisionModel(api_key=api_key, prompt=prompt) 30 | return model.describe_image(image_path) 31 | 32 | 33 | __version__ = "0.3.1" 34 | __all__ = [ 35 | "create_extractor", 36 | "describe_image", 37 | "describe_image_ollama", 38 | "describe_image_openai", 39 | "describe_image_claude", 40 | ] 41 | -------------------------------------------------------------------------------- /pyvisionai/cli/__init__.py: -------------------------------------------------------------------------------- 1 | """Command-line interface tools.""" 2 | 3 | from .describe_image import describe_image_cli 4 | from .extract import process_directory, process_file 5 | 6 | __all__ = [ 7 | "describe_image_cli", 8 | "process_file", 9 | "process_directory", 10 | ] 11 | -------------------------------------------------------------------------------- /pyvisionai/cli/describe_image.py: -------------------------------------------------------------------------------- 1 | """Command-line interface for image description.""" 2 | 3 | import argparse 4 | import os 5 | from typing import Optional 6 | 7 | from pyvisionai.describers import ( 8 | describe_image_claude, 9 | describe_image_ollama, 10 | describe_image_openai, 11 | ) 12 | from pyvisionai.utils.config import DEFAULT_IMAGE_MODEL, DEFAULT_PROMPT 13 | from pyvisionai.utils.logger import logger 14 | 15 | 16 | def describe_image_cli( 17 | image_path: str, 18 | model: str = DEFAULT_IMAGE_MODEL, 19 | api_key: Optional[str] = None, 20 | verbose: bool = False, 21 | prompt: Optional[str] = None, 22 | ) -> str: 23 | """ 24 | Describe an image using the specified model.
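    Example:
        A minimal sketch of calling this helper from Python; the image
        path and API key are hypothetical placeholders:

            text = describe_image_cli("photo.jpg", model="llama")
            text = describe_image_cli(
                "photo.jpg", model="claude", api_key="sk-ant-..."
            )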
25 | 26 | Args: 27 | image_path: Path to the image file 28 | model: Model to use (llama, gpt3, gpt4, or claude) 29 | api_key: API key (required for gpt3/gpt4/claude) 30 | verbose: Whether to print verbose output 31 | prompt: Custom prompt for image description (optional) 32 | 33 | Returns: 34 | str: Description of the image 35 | 36 | Note: 37 | - llama: Uses Ollama's llama3.2-vision model (local) 38 | - gpt3/gpt4: Uses OpenAI's gpt-4o-mini model (cloud) 39 | - claude: Uses Anthropic's Claude 3 Opus model (cloud) 40 | """ 41 | try: 42 | # Validate image path 43 | if not os.path.exists(image_path): 44 | raise FileNotFoundError( 45 | f"Image file not found: {image_path}" 46 | ) 47 | 48 | # Get description based on model 49 | if model == "llama": 50 | description = describe_image_ollama( 51 | image_path, 52 | prompt=prompt, 53 | ) 54 | elif model in ["gpt3", "gpt4"]: 55 | # Set OpenAI API key if provided 56 | if api_key: 57 | os.environ["OPENAI_API_KEY"] = api_key 58 | # Both GPT-3 and GPT-4 use cases use the same vision model 59 | description = describe_image_openai( 60 | image_path, 61 | api_key=api_key, 62 | prompt=prompt, 63 | ) 64 | elif model == "claude": 65 | # Set Anthropic API key if provided 66 | if api_key: 67 | os.environ["ANTHROPIC_API_KEY"] = api_key 68 | description = describe_image_claude( 69 | image_path, 70 | api_key=api_key, 71 | prompt=prompt, 72 | ) 73 | else: 74 | raise ValueError(f"Unsupported model: {model}") 75 | 76 | if verbose: 77 | print(f"\nDescription:\n{description}\n") 78 | 79 | return description 80 | 81 | except Exception as e: 82 | if verbose: 83 | logger.error(f"\nError: {str(e)}") 84 | raise 85 | 86 | 87 | def main(): 88 | """Main entry point for the CLI.""" 89 | parser = argparse.ArgumentParser( 90 | description="Describe an image using various models." 91 | ) 92 | 93 | # Source parameter group 94 | source_group = parser.add_mutually_exclusive_group(required=True) 95 | source_group.add_argument( 96 | "-s", 97 | "--source", 98 | help="Path to the image file to describe", 99 | ) 100 | source_group.add_argument( 101 | "-i", 102 | "--image", 103 | help="[Legacy] Path to the image file. For consistency with other commands, we recommend using -s/--source instead.", 104 | ) 105 | 106 | parser.add_argument( 107 | "-u", 108 | "--use-case", 109 | choices=["llama", "gpt3", "gpt4", "claude"], 110 | help="Legacy parameter for model selection. We recommend using --model for consistency, though --use-case remains supported for backward compatibility.", 111 | ) 112 | parser.add_argument( 113 | "-m", 114 | "--model", 115 | choices=["llama", "gpt3", "gpt4", "claude"], 116 | default=DEFAULT_IMAGE_MODEL, 117 | help="Model to use for description (gpt4: GPT-4 Vision, claude: Claude Vision, llama: Local Llama)", 118 | ) 119 | parser.add_argument( 120 | "-k", 121 | "--api-key", 122 | help="API key (required for GPT and Claude models)", 123 | ) 124 | parser.add_argument( 125 | "-v", 126 | "--verbose", 127 | action="store_true", 128 | help="Print verbose output", 129 | ) 130 | parser.add_argument( 131 | "-p", 132 | "--prompt", 133 | help=f"Custom prompt for image description (default: {DEFAULT_PROMPT})", 134 | ) 135 | 136 | args = parser.parse_args() 137 | 138 | try: 139 | # Determine which parameter was used and set image_path 140 | if args.image: 141 | image_path = args.image 142 | logger.warning( 143 | "For better consistency across commands, we recommend using -s/--source instead of -i/--image. " 144 | "Both parameters remain fully supported for backward compatibility." 
145 | ) 146 | else: 147 | image_path = args.source 148 | 149 | # Handle model parameter precedence 150 | model = ( 151 | args.use_case if args.use_case is not None else args.model 152 | ) 153 | if args.use_case is not None: 154 | logger.warning( 155 | "For better consistency across commands, we recommend using -m/--model instead of -u/--use-case. " 156 | "Both parameters remain fully supported for backward compatibility." 157 | ) 158 | 159 | description = describe_image_cli( 160 | image_path, 161 | model, 162 | args.api_key, 163 | args.verbose, 164 | args.prompt, 165 | ) 166 | print(description) 167 | except Exception as e: 168 | logger.error(str(e)) 169 | exit(1) 170 | 171 | 172 | if __name__ == "__main__": 173 | main() 174 | -------------------------------------------------------------------------------- /pyvisionai/cli/extract.py: -------------------------------------------------------------------------------- 1 | """Command-line interface for file extraction.""" 2 | 3 | import argparse 4 | import os 5 | from typing import Optional 6 | 7 | from pyvisionai.core.factory import create_extractor 8 | from pyvisionai.utils.config import ( 9 | CONTENT_DIR, 10 | DEFAULT_IMAGE_MODEL, 11 | DEFAULT_PDF_EXTRACTOR, 12 | DEFAULT_PROMPT, 13 | EXTRACTED_DIR, 14 | SOURCE_DIR, 15 | ) 16 | from pyvisionai.utils.logger import logger 17 | 18 | 19 | def process_file( 20 | file_type: str, 21 | input_file: str, 22 | output_dir: str, 23 | extractor_type: Optional[str] = None, 24 | model: Optional[str] = None, 25 | api_key: Optional[str] = None, 26 | prompt: Optional[str] = None, 27 | ) -> str: 28 | """ 29 | Process a single file using the appropriate extractor. 30 | 31 | Args: 32 | file_type: Type of file to process ('pdf', 'docx', 'pptx') 33 | input_file: Path to the input file 34 | output_dir: Directory to save extracted content 35 | extractor_type: Optional specific extractor type 36 | model: Optional model to use for image descriptions 37 | api_key: Optional OpenAI API key (required for GPT-4) 38 | prompt: Optional custom prompt for image description 39 | 40 | Returns: 41 | str: Path to the output file 42 | """ 43 | try: 44 | # Create output directory if it doesn't exist 45 | os.makedirs(output_dir, exist_ok=True) 46 | 47 | # Create and use appropriate extractor 48 | extractor = create_extractor( 49 | file_type, extractor_type, model, api_key 50 | ) 51 | # Set custom prompt if provided 52 | if prompt: 53 | extractor.prompt = prompt 54 | return extractor.extract(input_file, output_dir) 55 | 56 | except Exception as e: 57 | logger.error(f"Error processing file: {str(e)}") 58 | raise 59 | 60 | 61 | def process_directory( 62 | file_type: str, 63 | input_dir: str, 64 | output_dir: str, 65 | extractor_type: Optional[str] = None, 66 | model: Optional[str] = None, 67 | api_key: Optional[str] = None, 68 | prompt: Optional[str] = None, 69 | ) -> None: 70 | """ 71 | Process all files of a given type in a directory. 
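    Example:
        A minimal sketch; the directories are hypothetical placeholders
        (``os`` is already imported in this module):

            process_directory(
                "pdf",
                "content/source",
                "content/extracted",
                model="gpt4",
                api_key=os.environ.get("OPENAI_API_KEY"),
            )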
72 | 73 | Args: 74 | file_type: Type of files to process ('pdf', 'docx', 'pptx') 75 | input_dir: Directory containing input files 76 | output_dir: Directory to save extracted content 77 | extractor_type: Optional specific extractor type 78 | model: Optional model to use for image descriptions 79 | api_key: Optional OpenAI API key (required for GPT-4) 80 | prompt: Optional custom prompt for image description 81 | """ 82 | try: 83 | # Create output directory if it doesn't exist 84 | os.makedirs(output_dir, exist_ok=True) 85 | 86 | # Process each file 87 | for filename in os.listdir(input_dir): 88 | if filename.lower().endswith(f".{file_type}"): 89 | input_file = os.path.join(input_dir, filename) 90 | logger.info(f"Processing {input_file}...") 91 | process_file( 92 | file_type, 93 | input_file, 94 | output_dir, 95 | extractor_type, 96 | model, 97 | api_key, 98 | prompt, 99 | ) 100 | 101 | except Exception as e: 102 | logger.error(f"Error processing directory: {str(e)}") 103 | raise 104 | 105 | 106 | def main(): 107 | """Main entry point for the CLI.""" 108 | parser = argparse.ArgumentParser( 109 | description="Extract content from various file types." 110 | ) 111 | parser.add_argument( 112 | "-t", 113 | "--type", 114 | choices=["pdf", "docx", "pptx", "html"], 115 | required=True, 116 | help="Type of file to process", 117 | ) 118 | parser.add_argument( 119 | "-s", 120 | "--source", 121 | default=SOURCE_DIR, 122 | help="Source file or directory", 123 | ) 124 | parser.add_argument( 125 | "-o", "--output", default=EXTRACTED_DIR, help="Output directory" 126 | ) 127 | parser.add_argument( 128 | "-e", 129 | "--extractor", 130 | choices=["text_and_images", "page_as_image"], 131 | default=DEFAULT_PDF_EXTRACTOR, 132 | help="Type of extractor to use", 133 | ) 134 | parser.add_argument( 135 | "-m", 136 | "--model", 137 | choices=["llama", "gpt4"], 138 | default=DEFAULT_IMAGE_MODEL, 139 | help="Model to use for image descriptions", 140 | ) 141 | parser.add_argument( 142 | "-k", "--api-key", help="OpenAI API key (required for GPT-4)" 143 | ) 144 | parser.add_argument( 145 | "-p", 146 | "--prompt", 147 | help=f"Custom prompt for image description (default: {DEFAULT_PROMPT})", 148 | ) 149 | 150 | args = parser.parse_args() 151 | 152 | try: 153 | # Determine if source is a file or directory 154 | if os.path.isfile(args.source): 155 | process_file( 156 | args.type, 157 | args.source, 158 | args.output, 159 | args.extractor, 160 | args.model, 161 | args.api_key, 162 | args.prompt, 163 | ) 164 | elif os.path.isdir(args.source): 165 | process_directory( 166 | args.type, 167 | args.source, 168 | args.output, 169 | args.extractor, 170 | args.model, 171 | args.api_key, 172 | args.prompt, 173 | ) 174 | else: 175 | raise FileNotFoundError(f"Source not found: {args.source}") 176 | 177 | except Exception as e: 178 | logger.error(str(e)) 179 | exit(1) 180 | 181 | 182 | if __name__ == "__main__": 183 | main() 184 | -------------------------------------------------------------------------------- /pyvisionai/config/html_config.py: -------------------------------------------------------------------------------- 1 | """HTML processing configuration.""" 2 | 3 | # Default configuration for HTML processing 4 | DEFAULT_CONFIG = { 5 | # Viewport settings 6 | "viewport": { 7 | "width": 1920, # Standard desktop width 8 | "height": 1080, # Full HD height 9 | "device_scale_factor": 1.0, 10 | }, 11 | # Timing settings 12 | "timeout": { 13 | "page_load": 30000, # 30s for initial page load 14 | "wait_for_idle": 5000, # 5s wait for 
network idle 15 | "render_delay": 1000, # 1s extra for final renders 16 | }, 17 | # Browser settings 18 | "browser": { 19 | "headless": True, 20 | "javascript_enabled": True, 21 | "user_agent": ( 22 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " 23 | "AppleWebKit/537.36 (KHTML, like Gecko) " 24 | "Chrome/120.0.0.0 Safari/537.36" 25 | ), 26 | }, 27 | # Screenshot settings 28 | "screenshot": { 29 | "full_page": True, # Capture entire page 30 | "quality": 90, # JPEG quality 31 | "format": "jpeg", # JPEG for better compression 32 | "optimize": True, # Apply image optimization 33 | }, 34 | # Content settings 35 | "content": { 36 | "wait_for_fonts": True, # Wait for web fonts 37 | "wait_for_images": True, # Wait for images to load 38 | "remove_ads": True, # Try to remove ad elements 39 | "max_height": 15000, # Prevent infinite scrolls (px) 40 | }, 41 | } 42 | 43 | # Common ad-related selectors to remove if remove_ads is True 44 | AD_SELECTORS = [ 45 | 'div[class*="ad-"]', 46 | 'div[class*="ads-"]', 47 | 'div[id*="google_ads"]', 48 | 'div[class*="banner"]', 49 | ".advertisement", 50 | "#advertisement", 51 | ] 52 | 53 | # Elements to wait for before taking screenshot 54 | WAIT_SELECTORS = [ 55 | "img", # Images 56 | "video", # Video elements 57 | "canvas", # Canvas elements 58 | "svg", # SVG graphics 59 | "@font-face", # Custom fonts 60 | ] 61 | -------------------------------------------------------------------------------- /pyvisionai/core/__init__.py: -------------------------------------------------------------------------------- 1 | """Core package.""" 2 | 3 | from pyvisionai.core.factory import create_extractor 4 | 5 | __all__ = ["create_extractor"] 6 | -------------------------------------------------------------------------------- /pyvisionai/core/extractor.py: -------------------------------------------------------------------------------- 1 | """Base class for all content extractors.""" 2 | 3 | from abc import ABC, abstractmethod 4 | 5 | 6 | class BaseExtractor(ABC): 7 | """Base class for document content extractors.""" 8 | 9 | @abstractmethod 10 | def extract(self, file_path: str, output_dir: str) -> str: 11 | """ 12 | Extract content from a document file. 
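        Example:
            A sketch of the contract concrete subclasses are expected
            to honor; ``TxtExtractor`` is illustrative only, not part
            of the package:

                class TxtExtractor(BaseExtractor):
                    def extract(self, file_path: str, output_dir: str) -> str:
                        md_path = f"{output_dir}/notes_txt.md"
                        # ... read file_path, write markdown to md_path ...
                        return md_path  # callers rely on this return value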
13 | 14 | Args: 15 | file_path: Path to the document file to extract from 16 | output_dir: Directory to save extracted content 17 | 18 | Returns: 19 | str: Path to the generated markdown file 20 | """ 21 | pass 22 | -------------------------------------------------------------------------------- /pyvisionai/core/factory.py: -------------------------------------------------------------------------------- 1 | """Factory for creating extractors.""" 2 | 3 | from typing import Dict, Optional, Type 4 | 5 | from pyvisionai.extractors.base import BaseExtractor 6 | from pyvisionai.extractors.docx import DocxTextImageExtractor 7 | from pyvisionai.extractors.docx_page import DocxPageImageExtractor 8 | from pyvisionai.extractors.html_page import HtmlPageImageExtractor 9 | from pyvisionai.extractors.pdf import PDFTextImageExtractor 10 | from pyvisionai.extractors.pdf_page import PDFPageImageExtractor 11 | from pyvisionai.extractors.pptx import PptxTextImageExtractor 12 | from pyvisionai.extractors.pptx_page import PptxPageImageExtractor 13 | from pyvisionai.utils.config import DEFAULT_IMAGE_MODEL, DEFAULT_PROMPT 14 | 15 | # Map of file types to their extractors 16 | EXTRACTORS: Dict[str, Dict[str, Type[BaseExtractor]]] = { 17 | "pdf": { 18 | "text_and_images": PDFTextImageExtractor, 19 | "page_as_image": PDFPageImageExtractor, # Recommended for better results 20 | }, 21 | "docx": { 22 | "text_and_images": DocxTextImageExtractor, 23 | "page_as_image": DocxPageImageExtractor, # Use new page-as-image extractor 24 | }, 25 | "pptx": { 26 | "text_and_images": PptxTextImageExtractor, # Keep for now, will remove after testing 27 | "page_as_image": PptxPageImageExtractor, # Use new page-as-image extractor 28 | }, 29 | "html": { 30 | "page_as_image": HtmlPageImageExtractor, # Only page_as_image for HTML 31 | }, 32 | } 33 | 34 | 35 | def create_extractor( 36 | file_type: str, 37 | extractor_type: str = "page_as_image", 38 | model: str = DEFAULT_IMAGE_MODEL, 39 | api_key: Optional[str] = None, 40 | prompt: Optional[str] = None, 41 | ) -> BaseExtractor: 42 | """ 43 | Create an extractor instance based on file type and extraction method. 
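    Example:
        A minimal sketch; the file path, key, and prompt are
        hypothetical placeholders:

            extractor = create_extractor(
                "pdf",
                extractor_type="page_as_image",
                model="gpt4",
                api_key="sk-...",
                prompt="Summarize each page in two sentences",
            )
            md_path = extractor.extract("report.pdf", "output")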
44 | 45 | Args: 46 | file_type: Type of file to process (pdf, docx, pptx) 47 | extractor_type: Type of extraction: 48 | - page_as_image (default): Convert each page to image (recommended) 49 | - text_and_images: Extract text and images separately 50 | model: Model to use for image descriptions (llama, gpt4) 51 | api_key: OpenAI API key (required for GPT-4) 52 | prompt: Custom prompt for image description (optional) 53 | 54 | Returns: 55 | BaseExtractor: An instance of the appropriate extractor 56 | """ 57 | if file_type not in EXTRACTORS: 58 | raise ValueError(f"Unsupported file type: {file_type}") 59 | if extractor_type not in EXTRACTORS[file_type]: 60 | raise ValueError( 61 | f"Unsupported extractor type: {extractor_type}" 62 | ) 63 | 64 | extractor_class = EXTRACTORS[file_type][extractor_type] 65 | extractor = extractor_class() 66 | extractor.model = model 67 | extractor.api_key = api_key 68 | extractor.prompt = prompt or DEFAULT_PROMPT 69 | return extractor 70 | -------------------------------------------------------------------------------- /pyvisionai/describers/__init__.py: -------------------------------------------------------------------------------- 1 | """Image description functions.""" 2 | 3 | from typing import Optional 4 | 5 | from .base import ModelFactory, VisionModel, describe_image 6 | from .claude import ClaudeVisionModel 7 | from .ollama import LlamaVisionModel, describe_image_ollama 8 | from .openai import GPT4VisionModel, describe_image_openai 9 | 10 | # Register models with the factory 11 | ModelFactory.register_model("llama", LlamaVisionModel) 12 | ModelFactory.register_model("gpt4", GPT4VisionModel) 13 | ModelFactory.register_model("claude", ClaudeVisionModel) 14 | 15 | 16 | def describe_image_claude( 17 | image_path: str, 18 | api_key: Optional[str] = None, 19 | prompt: Optional[str] = None, 20 | ) -> str: 21 | """Describe an image using Claude Vision. 
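    Example:
        A minimal sketch; the image path and key are hypothetical
        placeholders:

            text = describe_image_claude(
                "chart.png",
                api_key="sk-ant-...",
                prompt="List the key data points",
            )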
22 | 23 | Args: 24 | image_path: Path to the image file 25 | api_key: Anthropic API key (optional) 26 | prompt: Custom prompt for image description (optional) 27 | 28 | Returns: 29 | str: Image description 30 | """ 31 | model = ClaudeVisionModel(api_key=api_key, prompt=prompt) 32 | return model.describe_image(image_path) 33 | 34 | 35 | __all__ = [ 36 | "describe_image", 37 | "describe_image_ollama", 38 | "describe_image_openai", 39 | "describe_image_claude", 40 | "VisionModel", 41 | "ModelFactory", 42 | "LlamaVisionModel", 43 | "GPT4VisionModel", 44 | "ClaudeVisionModel", 45 | ] 46 | -------------------------------------------------------------------------------- /pyvisionai/describers/base.py: -------------------------------------------------------------------------------- 1 | """Base image description functionality.""" 2 | 3 | import logging 4 | import os 5 | from abc import ABC, abstractmethod 6 | from typing import Dict, Optional, Type 7 | 8 | from ..utils.config import DEFAULT_IMAGE_MODEL 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | class VisionModel(ABC): 14 | """Base class for vision models.""" 15 | 16 | def __init__( 17 | self, 18 | api_key: Optional[str] = None, 19 | prompt: Optional[str] = None, 20 | ): 21 | """Initialize the model.""" 22 | self.api_key = api_key 23 | self.prompt = prompt 24 | logger.debug(f"Initializing {self.__class__.__name__}") 25 | 26 | @abstractmethod 27 | def describe_image(self, image_path: str) -> str: 28 | """Describe an image using this model.""" 29 | pass 30 | 31 | def validate_config(self) -> None: 32 | """Validate the model configuration.""" 33 | try: 34 | self._validate_config() 35 | logger.debug("Configuration validation successful") 36 | except Exception as e: 37 | logger.error("Configuration validation failed") 38 | raise 39 | 40 | @abstractmethod 41 | def _validate_config(self) -> None: 42 | """Internal validation implementation.""" 43 | pass 44 | 45 | 46 | class ModelFactory: 47 | """Factory for creating vision models.""" 48 | 49 | _models: Dict[str, Type[VisionModel]] = {} 50 | 51 | @classmethod 52 | def register_model( 53 | cls, name: str, model_class: Type[VisionModel] 54 | ) -> None: 55 | """Register a model with the factory.""" 56 | logger.info(f"Registering model type: {name}") 57 | cls._models[name] = model_class 58 | 59 | @classmethod 60 | def create_model( 61 | cls, 62 | model_type: Optional[str] = None, 63 | api_key: Optional[str] = None, 64 | prompt: Optional[str] = None, 65 | ) -> VisionModel: 66 | """Create a model instance.""" 67 | try: 68 | if model_type not in cls._models: 69 | raise ValueError( 70 | f"Unsupported model type: {model_type}" 71 | ) 72 | 73 | model = cls._models[model_type]( 74 | api_key=api_key, prompt=prompt 75 | ) 76 | logger.debug("Model creation successful") 77 | return model 78 | except Exception as e: 79 | logger.error("Model creation failed") 80 | raise 81 | 82 | 83 | def describe_image(image_path: str, model: Optional[str] = None) -> str: 84 | """ 85 | Describe the contents of an image using the specified model.
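    Example:
        A minimal sketch; note that when ``model`` is omitted, the
        configured default is tried first and the other registered
        models are attempted only if it fails with a connection error
        (see the fallback loop below):

            text = describe_image("photo.jpg")          # default + fallback
            text = describe_image("photo.jpg", "gpt4")  # explicit, no fallback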
86 | 87 | Args: 88 | image_path: Path to the image file 89 | model: Optional model name to use for description (default: uses configured default) 90 | 91 | Returns: 92 | str: Description of the image 93 | """ 94 | # Use configured default if no model specified 95 | model_type = model or DEFAULT_IMAGE_MODEL 96 | 97 | # Try the specified/default model first 98 | try: 99 | model_instance = ModelFactory.create_model( 100 | model_type=model_type 101 | ) 102 | return model_instance.describe_image(image_path) 103 | except (ConnectionError, ConnectionRefusedError) as e: 104 | logger.warning(f"Failed to connect to {model_type}: {str(e)}") 105 | 106 | # If the default model fails, try other available models 107 | if ( 108 | not model 109 | ): # Only try alternatives if no specific model was requested 110 | for alt_model in ModelFactory._models.keys(): 111 | if alt_model != model_type: 112 | try: 113 | logger.info( 114 | f"Attempting to use alternative model: {alt_model}" 115 | ) 116 | model_instance = ModelFactory.create_model( 117 | model_type=alt_model 118 | ) 119 | return model_instance.describe_image(image_path) 120 | except ( 121 | ConnectionError, 122 | ConnectionRefusedError, 123 | ) as e: 124 | logger.warning( 125 | f"Failed to connect to {alt_model}: {str(e)}" 126 | ) 127 | continue 128 | except Exception as e: 129 | logger.error( 130 | f"Error using {alt_model}: {str(e)}" 131 | ) 132 | continue 133 | 134 | # If we get here, either a specific model was requested or all alternatives failed 135 | raise ConnectionError( 136 | f"Failed to connect to {model_type} and no working alternatives found" 137 | ) 138 | -------------------------------------------------------------------------------- /pyvisionai/describers/claude.py: -------------------------------------------------------------------------------- 1 | """Claude Vision model for image description.""" 2 | 3 | import base64 4 | from typing import Optional 5 | from unittest.mock import MagicMock 6 | 7 | from anthropic import Anthropic, APIError, AuthenticationError 8 | 9 | from pyvisionai.describers.base import VisionModel 10 | from pyvisionai.utils.config import DEFAULT_PROMPT 11 | from pyvisionai.utils.retry import ( 12 | ConnectionError, 13 | RetryManager, 14 | RetryStrategy, 15 | ) 16 | 17 | 18 | def create_api_error(message: str) -> APIError: 19 | """Create an APIError with the required arguments.""" 20 | return APIError( 21 | message=message, 22 | request=MagicMock(), 23 | body={"error": {"message": message}}, 24 | ) 25 | 26 | 27 | class ClaudeVisionModel(VisionModel): 28 | """Claude Vision model for image description.""" 29 | 30 | def __init__( 31 | self, 32 | api_key: Optional[str] = None, 33 | prompt: Optional[str] = None, 34 | ): 35 | """Initialize the Claude Vision model. 36 | 37 | Args: 38 | api_key: Anthropic API key (optional) 39 | prompt: Custom prompt for image description (optional) 40 | """ 41 | super().__init__(api_key=api_key, prompt=prompt) 42 | self.client = None 43 | self.retry_manager = RetryManager( 44 | max_attempts=3, # Initial attempt + 2 retries to match tests 45 | strategy=RetryStrategy.EXPONENTIAL, 46 | base_delay=1.0, 47 | max_delay=10.0, 48 | ) 49 | 50 | def _validate_config(self) -> None: 51 | """Validate the model configuration.""" 52 | if not self.api_key: 53 | raise ValueError("Anthropic API key is required") 54 | if not self.client: 55 | self.client = Anthropic(api_key=self.api_key) 56 | 57 | def describe_image(self, image_path: str) -> str: 58 | """Describe an image using Claude Vision. 
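        Example:
            A minimal sketch of direct model use; the path and key are
            hypothetical placeholders:

                model = ClaudeVisionModel(api_key="sk-ant-...")
                try:
                    text = model.describe_image("photo.jpg")
                except ConnectionError:
                    pass  # raised on authentication/connection failures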
59 | 60 | Args: 61 | image_path: Path to the image file 62 | 63 | Returns: 64 | str: Image description 65 | 66 | Raises: 67 | ValueError: If the configuration is invalid or no description generated 68 | ConnectionError: If API connection fails 69 | RuntimeError: For other errors 70 | """ 71 | self.validate_config() 72 | 73 | def _call_api(): 74 | with open(image_path, "rb") as f: 75 | image_data = f.read() 76 | 77 | effective_prompt = self.prompt or DEFAULT_PROMPT 78 | response = self.client.messages.create( 79 | model="claude-3-opus-20240229", 80 | max_tokens=1024, 81 | messages=[ 82 | { 83 | "role": "user", 84 | "content": [ 85 | {"type": "text", "text": effective_prompt}, 86 | { 87 | "type": "image", 88 | "source": { 89 | "type": "base64", 90 | "media_type": "image/jpeg", 91 | "data": base64.b64encode( 92 | image_data 93 | ).decode(), 94 | }, 95 | }, 96 | ], 97 | } 98 | ], 99 | ) 100 | 101 | # More defensive response validation 102 | if ( 103 | not response 104 | or not hasattr(response, 'content') 105 | or not response.content 106 | or not isinstance(response.content, list) 107 | or not response.content[0] 108 | or not hasattr(response.content[0], 'text') 109 | ): 110 | raise ValueError("No description generated") 111 | 112 | text = response.content[0].text 113 | if not text or not text.strip(): 114 | raise ValueError("No description generated") 115 | 116 | return text.strip() 117 | 118 | try: 119 | return self.retry_manager.execute(_call_api) 120 | except APIError as e: 121 | error_msg = str(e).lower() 122 | if "authentication" in error_msg or "401" in error_msg: 123 | raise ConnectionError( 124 | f"Authentication failed: {str(e)}" 125 | ) 126 | # Let retry manager handle rate limits and server errors 127 | raise 128 | except (ValueError, ConnectionError) as e: 129 | # Re-raise these errors directly 130 | raise 131 | except Exception as e: 132 | raise RuntimeError( 133 | f"Error describing image with Claude: {str(e)}" 134 | ) 135 | -------------------------------------------------------------------------------- /pyvisionai/describers/ollama.py: -------------------------------------------------------------------------------- 1 | """Image description using Ollama's Llama3.2 Vision model.""" 2 | 3 | import base64 4 | from typing import Optional 5 | 6 | import requests 7 | 8 | from ..utils.config import DEFAULT_PROMPT, OLLAMA_MODEL_NAME 9 | from ..utils.logger import logger 10 | from ..utils.retry import RetryManager, RetryStrategy 11 | from .base import VisionModel 12 | 13 | 14 | class LlamaVisionModel(VisionModel): 15 | """Llama Vision model implementation.""" 16 | 17 | def __init__( 18 | self, 19 | api_key: Optional[str] = None, 20 | prompt: Optional[str] = None, 21 | ): 22 | super().__init__(api_key=api_key, prompt=prompt) 23 | self.model_name = OLLAMA_MODEL_NAME 24 | # Initialize retry manager with exponential backoff 25 | self.retry_manager = RetryManager( 26 | max_attempts=3, 27 | strategy=RetryStrategy.EXPONENTIAL, 28 | base_delay=1.0, 29 | max_delay=10.0, 30 | logger=logger, 31 | ) 32 | 33 | def describe_image(self, image_path: str) -> str: 34 | """Describe an image using Ollama's Llama3.2 Vision model.""" 35 | 36 | def _make_request(): 37 | # Read and encode image 38 | with open(image_path, "rb") as image_file: 39 | image_data = base64.b64encode( 40 | image_file.read() 41 | ).decode() 42 | 43 | # Use default prompt if none provided 44 | prompt = self.prompt or DEFAULT_PROMPT 45 | 46 | # Prepare request 47 | url = "http://localhost:11434/api/generate" 48 | payload = { 49 | 
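                # Request body for Ollama's /api/generate endpoint: with
                # "stream": False the server returns a single JSON object
                # whose "response" field carries the generated text; it is
                # parsed below via result.get("response").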
"model": self.model_name, 50 | "prompt": prompt, 51 | "stream": False, 52 | "images": [image_data], 53 | } 54 | 55 | # Make request 56 | response = requests.post(url, json=payload) 57 | response.raise_for_status() 58 | 59 | # Extract description 60 | result = response.json() 61 | description = result.get("response", "").strip() 62 | 63 | if not description: 64 | raise ValueError("No description generated") 65 | 66 | return description 67 | 68 | try: 69 | # Execute with retry 70 | return self.retry_manager.execute(_make_request) 71 | except requests.exceptions.ConnectionError as e: 72 | error_msg = str(e) 73 | logger.error( 74 | f"Error describing image with Ollama: {error_msg}" 75 | ) 76 | raise ConnectionError(error_msg) 77 | except Exception as e: 78 | logger.error( 79 | f"Error describing image with Ollama: {str(e)}" 80 | ) 81 | raise 82 | 83 | def _validate_config(self) -> None: 84 | """Validate Ollama configuration.""" 85 | # No API key required for Ollama 86 | pass 87 | 88 | 89 | def describe_image_ollama( 90 | image_path: str, 91 | model: Optional[str] = None, 92 | prompt: Optional[str] = None, 93 | ) -> str: 94 | """ 95 | Describe an image using Ollama's Llama3.2 Vision model. 96 | 97 | Args: 98 | image_path: Path to the image file 99 | model: Name of the Ollama model to use (default: llama3.2-vision) 100 | prompt: Custom prompt for image description (optional) 101 | 102 | Returns: 103 | str: Description of the image 104 | """ 105 | try: 106 | # Read and encode image 107 | with open(image_path, "rb") as image_file: 108 | image_data = base64.b64encode(image_file.read()).decode() 109 | 110 | # Use default prompt if none provided 111 | prompt = prompt or DEFAULT_PROMPT 112 | # Use default model if none provided 113 | model = model or OLLAMA_MODEL_NAME 114 | 115 | # Prepare request 116 | url = "http://localhost:11434/api/generate" 117 | payload = { 118 | "model": model, 119 | "prompt": prompt, 120 | "stream": False, 121 | "images": [image_data], 122 | } 123 | 124 | # Make request 125 | response = requests.post(url, json=payload) 126 | response.raise_for_status() 127 | 128 | # Extract description 129 | result = response.json() 130 | description = result.get("response", "").strip() 131 | 132 | if not description: 133 | raise ValueError("No description generated") 134 | 135 | return description 136 | 137 | except Exception as e: 138 | logger.error(f"Error describing image with Ollama: {str(e)}") 139 | raise 140 | -------------------------------------------------------------------------------- /pyvisionai/describers/openai.py: -------------------------------------------------------------------------------- 1 | """Image description using OpenAI's GPT-4 Vision model.""" 2 | 3 | import base64 4 | from typing import Optional 5 | 6 | from openai import OpenAI 7 | 8 | from ..utils.config import DEFAULT_PROMPT, OPENAI_MODEL_NAME 9 | from ..utils.logger import logger 10 | from ..utils.retry import RetryManager, RetryStrategy 11 | from .base import VisionModel 12 | 13 | 14 | class GPT4VisionModel(VisionModel): 15 | """GPT-4 Vision model implementation.""" 16 | 17 | def __init__( 18 | self, 19 | api_key: Optional[str] = None, 20 | prompt: Optional[str] = None, 21 | ): 22 | super().__init__(api_key=api_key, prompt=prompt) 23 | self.max_tokens = 300 24 | self.model_name = OPENAI_MODEL_NAME 25 | # Initialize retry manager with exponential backoff 26 | self.retry_manager = RetryManager( 27 | max_attempts=3, 28 | strategy=RetryStrategy.EXPONENTIAL, 29 | base_delay=1.0, 30 | max_delay=10.0, 31 | 
logger=logger, 32 | ) 33 | 34 | def describe_image(self, image_path: str) -> str: 35 | """Describe an image using OpenAI's GPT-4 Vision model.""" 36 | 37 | def _make_request(): 38 | # Initialize client 39 | client = OpenAI(api_key=self.api_key) 40 | 41 | # Read and encode image 42 | with open(image_path, "rb") as image_file: 43 | image_data = base64.b64encode( 44 | image_file.read() 45 | ).decode() 46 | 47 | # Use default prompt if none provided 48 | prompt = self.prompt or DEFAULT_PROMPT 49 | 50 | # Prepare request 51 | response = client.chat.completions.create( 52 | model=self.model_name, 53 | messages=[ 54 | { 55 | "role": "user", 56 | "content": [ 57 | { 58 | "type": "text", 59 | "text": prompt, 60 | }, 61 | { 62 | "type": "image_url", 63 | "image_url": { 64 | "url": f"data:image/jpeg;base64,{image_data}" 65 | }, 66 | }, 67 | ], 68 | } 69 | ], 70 | max_tokens=self.max_tokens, 71 | ) 72 | 73 | # Extract description 74 | description = response.choices[0].message.content.strip() 75 | 76 | if not description: 77 | raise ValueError("No description generated") 78 | 79 | return description 80 | 81 | try: 82 | # Execute with retry 83 | return self.retry_manager.execute(_make_request) 84 | except Exception as e: 85 | logger.error( 86 | f"Error describing image with OpenAI: {str(e)}" 87 | ) 88 | raise 89 | 90 | def _validate_config(self) -> None: 91 | """Validate OpenAI configuration.""" 92 | if not self.api_key: 93 | raise ValueError("OpenAI API key is required") 94 | 95 | 96 | def describe_image_openai( 97 | image_path: str, 98 | model: Optional[str] = None, 99 | api_key: Optional[str] = None, 100 | max_tokens: int = 300, 101 | prompt: Optional[str] = None, 102 | ) -> str: 103 | """ 104 | Describe an image using OpenAI's GPT-4 Vision model. 105 | 106 | Args: 107 | image_path: Path to the image file 108 | model: Name of the OpenAI model to use (default: gpt-4o-mini) 109 | api_key: OpenAI API key (optional if set in environment) 110 | max_tokens: Maximum tokens in the response 111 | prompt: Custom prompt for image description (optional) 112 | 113 | Returns: 114 | str: Description of the image 115 | """ 116 | try: 117 | # Initialize client 118 | client = OpenAI(api_key=api_key) 119 | 120 | # Read and encode image 121 | with open(image_path, "rb") as image_file: 122 | image_data = base64.b64encode(image_file.read()).decode() 123 | 124 | # Use default prompt if none provided 125 | prompt = prompt or DEFAULT_PROMPT 126 | # Use default model if none provided 127 | model = model or OPENAI_MODEL_NAME 128 | 129 | # Prepare request 130 | response = client.chat.completions.create( 131 | model=model, 132 | messages=[ 133 | { 134 | "role": "user", 135 | "content": [ 136 | { 137 | "type": "text", 138 | "text": prompt, 139 | }, 140 | { 141 | "type": "image_url", 142 | "image_url": { 143 | "url": f"data:image/jpeg;base64,{image_data}" 144 | }, 145 | }, 146 | ], 147 | } 148 | ], 149 | max_tokens=max_tokens, 150 | ) 151 | 152 | # Extract description 153 | description = response.choices[0].message.content.strip() 154 | 155 | if not description: 156 | raise ValueError("No description generated") 157 | 158 | return description 159 | 160 | except Exception as e: 161 | logger.error(f"Error describing image with OpenAI: {str(e)}") 162 | raise 163 | -------------------------------------------------------------------------------- /pyvisionai/extractors/__init__.py: -------------------------------------------------------------------------------- 1 | """Extractors package.""" 2 | 3 | from pyvisionai.extractors.docx import 
DocxTextImageExtractor 4 | from pyvisionai.extractors.docx_page import DocxPageImageExtractor 5 | from pyvisionai.extractors.html_page import HtmlPageImageExtractor 6 | from pyvisionai.extractors.pdf import PDFTextImageExtractor 7 | from pyvisionai.extractors.pdf_page import PDFPageImageExtractor 8 | from pyvisionai.extractors.pptx import PptxTextImageExtractor 9 | from pyvisionai.extractors.pptx_page import PptxPageImageExtractor 10 | 11 | __all__ = [ 12 | "PDFTextImageExtractor", 13 | "PDFPageImageExtractor", 14 | "DocxTextImageExtractor", 15 | "DocxPageImageExtractor", 16 | "PptxTextImageExtractor", 17 | "PptxPageImageExtractor", 18 | "HtmlPageImageExtractor", 19 | ] 20 | -------------------------------------------------------------------------------- /pyvisionai/extractors/base.py: -------------------------------------------------------------------------------- 1 | """Base class for all extractors.""" 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import Optional 5 | 6 | from pyvisionai.describers import ( 7 | describe_image_ollama, 8 | describe_image_openai, 9 | ) 10 | from pyvisionai.utils.config import ( 11 | DEFAULT_IMAGE_MODEL, 12 | DEFAULT_PROMPT, 13 | OPENAI_MODEL_NAME, 14 | ) 15 | 16 | 17 | class BaseExtractor(ABC): 18 | """Base class for all extractors.""" 19 | 20 | def __init__(self): 21 | """Initialize the extractor.""" 22 | self.model = DEFAULT_IMAGE_MODEL 23 | self.api_key = None 24 | self.prompt = DEFAULT_PROMPT 25 | 26 | def describe_image(self, image_path: str) -> str: 27 | """ 28 | Describe an image using the configured model. 29 | 30 | Args: 31 | image_path: Path to the image file 32 | 33 | Returns: 34 | str: Description of the image 35 | """ 36 | if self.model == "llama": 37 | return describe_image_ollama( 38 | image_path, 39 | prompt=self.prompt, 40 | ) 41 | elif self.model == "gpt4": 42 | return describe_image_openai( 43 | image_path, 44 | model=OPENAI_MODEL_NAME, 45 | api_key=self.api_key, 46 | prompt=self.prompt, 47 | ) 48 | else: 49 | raise ValueError(f"Unsupported model: {self.model}") 50 | 51 | @abstractmethod 52 | def extract(self, input_file: str, output_dir: str) -> str: 53 | """ 54 | Extract content from a file. 
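        Example:
            A sketch of direct use without the factory, which normally
            sets these attributes; the paths are hypothetical
            placeholders:

                extractor = DocxTextImageExtractor()
                extractor.model = "llama"
                extractor.prompt = "Describe any charts in detail"
                md_path = extractor.extract("notes.docx", "output")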
55 | 56 | Args: 57 | input_file: Path to the input file 58 | output_dir: Directory to save extracted content 59 | 60 | Returns: 61 | str: Path to the generated markdown file 62 | """ 63 | pass 64 | -------------------------------------------------------------------------------- /pyvisionai/extractors/docx.py: -------------------------------------------------------------------------------- 1 | """DOCX content extractor.""" 2 | 3 | import concurrent.futures 4 | import io 5 | import os 6 | from dataclasses import dataclass 7 | from typing import List, Tuple 8 | 9 | from docx import Document 10 | from PIL import Image 11 | 12 | from pyvisionai.extractors.base import BaseExtractor 13 | from pyvisionai.utils.logger import logger 14 | 15 | 16 | @dataclass 17 | class ImageTask: 18 | """Container for image processing task data.""" 19 | 20 | image_data: bytes 21 | image_name: str 22 | output_dir: str 23 | index: int 24 | 25 | 26 | class DocxTextImageExtractor(BaseExtractor): 27 | """Extract text and images from DOCX files.""" 28 | 29 | def extract_text_and_images( 30 | self, docx_path: str 31 | ) -> Tuple[List[str], List[bytes]]: 32 | """Extract text and images from DOCX file.""" 33 | doc = Document(docx_path) 34 | paragraphs = [] 35 | images = [] 36 | 37 | # Extract text from paragraphs 38 | for paragraph in doc.paragraphs: 39 | if paragraph.text.strip(): 40 | paragraphs.append(paragraph.text) 41 | 42 | # Extract images from relationships 43 | rels = doc.part.rels 44 | for rel in rels.values(): 45 | if "image" in rel.target_ref: 46 | try: 47 | image_data = rel.target_part.blob 48 | images.append(image_data) 49 | except Exception as e: 50 | logger.error(f"Error extracting image: {str(e)}") 51 | continue 52 | 53 | return paragraphs, images 54 | 55 | def save_image( 56 | self, image_data: bytes, output_dir: str, image_name: str 57 | ) -> str: 58 | """Save an image to the output directory.""" 59 | try: 60 | # Convert image data to PIL Image 61 | image = Image.open(io.BytesIO(image_data)) 62 | # Convert to RGB if necessary 63 | if image.mode != "RGB": 64 | image = image.convert("RGB") 65 | # Save as JPEG (supported format) 66 | img_path = os.path.join(output_dir, f"{image_name}.jpg") 67 | image.save(img_path, "JPEG", quality=95) 68 | return img_path 69 | except Exception as e: 70 | logger.error(f"Error saving image: {str(e)}") 71 | raise 72 | 73 | def process_image_task(self, task: ImageTask) -> tuple[int, str]: 74 | """Process a single image task.""" 75 | try: 76 | img_path = self.save_image( 77 | task.image_data, task.output_dir, task.image_name 78 | ) 79 | image_description = self.describe_image(img_path) 80 | os.remove(img_path) # Clean up 81 | return task.index, image_description 82 | except Exception as e: 83 | logger.error( 84 | f"Error processing image {task.image_name}: {str(e)}" 85 | ) 86 | return ( 87 | task.index, 88 | f"Error: Could not process image {task.image_name}", 89 | ) 90 | 91 | def extract(self, docx_path: str, output_dir: str) -> str: 92 | """Process DOCX file by extracting text and images separately.""" 93 | try: 94 | docx_filename = os.path.splitext( 95 | os.path.basename(docx_path) 96 | )[0] 97 | 98 | # Extract text and images 99 | paragraphs, images = self.extract_text_and_images(docx_path) 100 | 101 | # Generate markdown content 102 | md_content = f"# {docx_filename}\n\n" 103 | 104 | # Add text content 105 | for paragraph in paragraphs: 106 | md_content += f"{paragraph}\n\n" 107 | 108 | # Prepare image tasks 109 | image_tasks = [] 110 | for img_index, img_data in 
enumerate(images): 111 | image_name = f"{docx_filename}_image_{img_index + 1}" 112 | task = ImageTask( 113 | image_data=img_data, 114 | image_name=image_name, 115 | output_dir=output_dir, 116 | index=img_index, 117 | ) 118 | image_tasks.append(task) 119 | 120 | # Process images in parallel if there are any 121 | if image_tasks: 122 | # Store descriptions in order 123 | descriptions = [""] * len(image_tasks) 124 | 125 | # Use ThreadPoolExecutor for parallel processing 126 | with concurrent.futures.ThreadPoolExecutor( 127 | max_workers=4 128 | ) as executor: 129 | # Submit all tasks 130 | future_to_task = { 131 | executor.submit( 132 | self.process_image_task, task 133 | ): task 134 | for task in image_tasks 135 | } 136 | 137 | # Collect results as they complete 138 | for future in concurrent.futures.as_completed( 139 | future_to_task 140 | ): 141 | idx, description = future.result() 142 | descriptions[idx] = description 143 | 144 | # Add descriptions to markdown in correct order 145 | for img_index, description in enumerate(descriptions): 146 | md_content += f"[Image {img_index + 1}]\n" 147 | md_content += f"Description: {description}\n\n" 148 | 149 | # Save markdown file 150 | md_file_path = os.path.join( 151 | output_dir, f"{docx_filename}_docx.md" 152 | ) 153 | with open(md_file_path, "w", encoding="utf-8") as md_file: 154 | md_file.write(md_content) 155 | 156 | # Add info logging 157 | logger.info("Processing DOCX file...") 158 | logger.info(f"Extracted {len(images)} images") 159 | logger.info("DOCX processing completed successfully") 160 | 161 | return md_file_path 162 | 163 | except Exception as e: 164 | logger.error(f"Error processing DOCX: {str(e)}") 165 | raise 166 | -------------------------------------------------------------------------------- /pyvisionai/extractors/docx_page.py: -------------------------------------------------------------------------------- 1 | """DOCX page-as-image extractor.""" 2 | 3 | import concurrent.futures 4 | import logging 5 | import os 6 | import subprocess 7 | import tempfile 8 | import time 9 | from dataclasses import dataclass 10 | from typing import Tuple 11 | 12 | from PIL import Image 13 | 14 | from pyvisionai.extractors.base import BaseExtractor 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | 19 | @dataclass 20 | class PageTask: 21 | """Task for processing a single page.""" 22 | 23 | index: int 24 | image: Image.Image 25 | output_dir: str 26 | image_name: str 27 | 28 | 29 | class DocxPageImageExtractor(BaseExtractor): 30 | """Extract content from DOCX files by converting pages to images.""" 31 | 32 | def convert_to_pdf(self, docx_path: str) -> str: 33 | """Convert DOCX to PDF using LibreOffice.""" 34 | try: 35 | # Create a temporary directory for the PDF 36 | temp_dir = tempfile.mkdtemp() 37 | 38 | # Get absolute paths 39 | abs_docx_path = os.path.abspath(docx_path) 40 | abs_temp_dir = os.path.abspath(temp_dir) 41 | 42 | # The output PDF will have the same name as the input DOCX 43 | docx_filename = os.path.splitext( 44 | os.path.basename(docx_path) 45 | )[0] 46 | pdf_path = os.path.join( 47 | abs_temp_dir, f"{docx_filename}.pdf" 48 | ) 49 | 50 | # Convert DOCX to PDF using LibreOffice 51 | cmd = [ 52 | "soffice", 53 | "--headless", 54 | "--convert-to", 55 | "pdf", 56 | "--outdir", 57 | abs_temp_dir, 58 | abs_docx_path, 59 | ] 60 | 61 | # Run the command and capture output 62 | result = subprocess.run( 63 | cmd, check=True, capture_output=True, text=True 64 | ) 65 | 66 | # Wait a moment for the file to be written 67 | time.sleep(1) 68 
| 69 | # Verify the PDF was created 70 | if not os.path.exists(pdf_path): 71 | raise FileNotFoundError( 72 | f"PDF file was not created. LibreOffice output:\n" 73 | f"STDOUT: {result.stdout}\n" 74 | f"STDERR: {result.stderr}" 75 | ) 76 | 77 | return pdf_path 78 | 79 | except subprocess.CalledProcessError as e: 80 | raise RuntimeError( 81 | f"LibreOffice conversion failed:\n" 82 | f"STDOUT: {e.stdout}\n" 83 | f"STDERR: {e.stderr}" 84 | ) from e 85 | except Exception as e: 86 | raise RuntimeError( 87 | f"Failed to convert DOCX to PDF: {str(e)}" 88 | ) from e 89 | 90 | def convert_pages_to_images(self, pdf_path: str) -> list: 91 | """Convert PDF pages to images using pdf2image.""" 92 | from pdf2image import convert_from_path 93 | 94 | return convert_from_path(pdf_path, dpi=300) 95 | 96 | def save_image( 97 | self, image: Image.Image, output_dir: str, image_name: str 98 | ) -> str: 99 | """Save an image to the output directory.""" 100 | img_path = os.path.join(output_dir, f"{image_name}.jpg") 101 | image.save(img_path, "JPEG", quality=95) 102 | return img_path 103 | 104 | def process_page(self, task: PageTask) -> Tuple[int, str]: 105 | """Process a single page.""" 106 | try: 107 | # Save page image 108 | img_path = self.save_image( 109 | task.image, task.output_dir, task.image_name 110 | ) 111 | 112 | # Get page description using configured model 113 | page_description = self.describe_image(img_path) 114 | 115 | # Clean up image file 116 | os.remove(img_path) 117 | 118 | return task.index, page_description 119 | except Exception as e: 120 | logger.error( 121 | f"Error processing page {task.image_name}: {str(e)}" 122 | ) 123 | return ( 124 | task.index, 125 | f"Error: Could not process page {task.image_name}", 126 | ) 127 | 128 | def extract(self, docx_path: str, output_dir: str) -> str: 129 | """Process DOCX file by converting each page to an image.""" 130 | try: 131 | logger.info("Processing DOCX file...") 132 | 133 | docx_filename = os.path.splitext( 134 | os.path.basename(docx_path) 135 | )[0] 136 | 137 | # Create temporary directory for page images 138 | pages_dir = os.path.join( 139 | output_dir, f"{docx_filename}_pages" 140 | ) 141 | if not os.path.exists(pages_dir): 142 | os.makedirs(pages_dir) 143 | 144 | # Convert DOCX to PDF first 145 | pdf_path = self.convert_to_pdf(docx_path) 146 | logger.info("Converted DOCX to PDF") 147 | 148 | # Convert PDF pages to images 149 | images = self.convert_pages_to_images(pdf_path) 150 | logger.info(f"Converting {len(images)} pages to images") 151 | 152 | # Generate markdown content 153 | md_content = f"# {docx_filename}\n\n" 154 | 155 | # Create page tasks 156 | page_tasks = [] 157 | for page_num, image in enumerate(images): 158 | image_name = f"page_{page_num + 1}" 159 | task = PageTask( 160 | index=page_num, 161 | image=image, 162 | output_dir=pages_dir, 163 | image_name=image_name, 164 | ) 165 | page_tasks.append(task) 166 | 167 | # Process pages in parallel 168 | descriptions = [""] * len(images) 169 | with concurrent.futures.ThreadPoolExecutor( 170 | max_workers=4 171 | ) as executor: 172 | # Submit all tasks 173 | future_to_page = { 174 | executor.submit(self.process_page, task): task.index 175 | for task in page_tasks 176 | } 177 | 178 | # Collect results as they complete 179 | for future in concurrent.futures.as_completed( 180 | future_to_page 181 | ): 182 | page_num, description = future.result() 183 | descriptions[page_num] = description 184 | 185 | # Add descriptions to markdown in correct order 186 | for page_num, description in 
enumerate(descriptions): 187 | md_content += f"## Page {page_num + 1}\n\n" 188 | md_content += f"[Image {page_num + 1}]\n" 189 | md_content += f"Description: {description}\n\n" 190 | 191 | # Save markdown file 192 | md_file_path = os.path.join( 193 | output_dir, f"{docx_filename}_docx.md" 194 | ) 195 | with open(md_file_path, "w", encoding="utf-8") as md_file: 196 | md_file.write(md_content) 197 | 198 | # Clean up temporary files 199 | os.remove(pdf_path) 200 | os.rmdir( 201 | os.path.dirname(pdf_path) 202 | ) # Remove temp PDF directory 203 | os.rmdir(pages_dir) # Remove pages directory 204 | 205 | logger.info("DOCX processing completed successfully") 206 | return md_file_path 207 | 208 | except Exception as e: 209 | logger.error(f"Error processing DOCX: {str(e)}") 210 | raise 211 | -------------------------------------------------------------------------------- /pyvisionai/extractors/html/browser.py: -------------------------------------------------------------------------------- 1 | """Browser management for HTML processing.""" 2 | 3 | import asyncio 4 | import logging 5 | from typing import Optional 6 | 7 | from playwright.async_api import Browser, Page, async_playwright 8 | 9 | from pyvisionai.config.html_config import ( 10 | AD_SELECTORS, 11 | DEFAULT_CONFIG, 12 | WAIT_SELECTORS, 13 | ) 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | async def setup_browser(config: Optional[dict] = None) -> Browser: 19 | """Set up and return a browser instance.""" 20 | config = config or DEFAULT_CONFIG 21 | browser_config = config["browser"] 22 | 23 | playwright = await async_playwright().start() 24 | browser = await playwright.chromium.launch( 25 | headless=browser_config["headless"] 26 | ) 27 | return browser 28 | 29 | 30 | async def setup_page( 31 | browser: Browser, config: Optional[dict] = None 32 | ) -> Page: 33 | """Set up and return a page with configured viewport and settings.""" 34 | config = config or DEFAULT_CONFIG 35 | viewport = config["viewport"] 36 | 37 | page = await browser.new_page() 38 | await page.set_viewport_size( 39 | {"width": viewport["width"], "height": viewport["height"]} 40 | ) 41 | await page.set_extra_http_headers( 42 | {"User-Agent": config["browser"]["user_agent"]} 43 | ) 44 | return page 45 | 46 | 47 | async def process_page( 48 | url: str, config: Optional[dict] = None 49 | ) -> bytes: 50 | """Process a webpage and return its screenshot.""" 51 | config = config or DEFAULT_CONFIG 52 | timeout = config["timeout"] 53 | content = config["content"] 54 | 55 | async with async_playwright() as p: 56 | browser = await p.chromium.launch( 57 | headless=config["browser"]["headless"] 58 | ) 59 | page = await setup_page(browser, config) 60 | 61 | try: 62 | # Navigate to the page 63 | await page.goto( 64 | url, 65 | wait_until="networkidle", 66 | timeout=timeout["page_load"], 67 | ) 68 | 69 | # Wait for key elements 70 | if content["wait_for_images"]: 71 | for selector in WAIT_SELECTORS: 72 | try: 73 | await page.wait_for_selector( 74 | selector, 75 | state="visible", 76 | timeout=timeout["wait_for_idle"], 77 | ) 78 | except TimeoutError: 79 | # Skip if element not found within timeout 80 | continue 81 | except Exception as e: 82 | logger.warning( 83 | f"Error waiting for selector {selector}: {str(e)}" 84 | ) 85 | continue 86 | 87 | # Remove ads if configured 88 | if content["remove_ads"]: 89 | for selector in AD_SELECTORS: 90 | try: 91 | await page.evaluate( 92 | f""" 93 | document.querySelectorAll('{selector}') 94 | .forEach(el => el.remove()) 95 | """ 96 | ) 97 | 
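# NOTE: the selector is interpolated directly into the JS snippet above,
# so AD_SELECTORS entries must be trusted and must not contain single
# quotes; passing the selector as an argument to page.evaluate() would
# avoid the quoting issue entirely.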
except Exception as e: 98 | logger.warning( 99 | f"Error removing ads with selector {selector}: {str(e)}" 100 | ) 101 | continue 102 | 103 | # Add small delay for final renders 104 | await page.wait_for_timeout(timeout["render_delay"]) 105 | 106 | # Take screenshot 107 | screenshot = await page.screenshot( 108 | full_page=config["screenshot"]["full_page"], 109 | type=config["screenshot"]["format"], 110 | quality=config["screenshot"]["quality"], 111 | ) 112 | 113 | return screenshot 114 | 115 | finally: 116 | await browser.close() 117 | 118 | 119 | def capture_webpage(url: str, config: Optional[dict] = None) -> bytes: 120 | """Synchronous wrapper for processing webpage.""" 121 | return asyncio.run(process_page(url, config)) 122 | -------------------------------------------------------------------------------- /pyvisionai/extractors/html_page.py: -------------------------------------------------------------------------------- 1 | """HTML page-as-image extractor.""" 2 | 3 | import io 4 | import os 5 | import tempfile 6 | 7 | from PIL import Image 8 | 9 | from pyvisionai.config.html_config import DEFAULT_CONFIG 10 | from pyvisionai.extractors.base import BaseExtractor 11 | from pyvisionai.extractors.html.browser import capture_webpage 12 | from pyvisionai.utils.logger import logger 13 | 14 | 15 | class HtmlPageImageExtractor(BaseExtractor): 16 | """Extract content from HTML files by converting pages to images.""" 17 | 18 | def save_image( 19 | self, image_data: bytes, output_dir: str, image_name: str 20 | ) -> str: 21 | """Save an image to the output directory.""" 22 | try: 23 | # Convert bytes to PIL Image 24 | image = Image.open(io.BytesIO(image_data)) 25 | # Convert to RGB if necessary 26 | if image.mode != "RGB": 27 | image = image.convert("RGB") 28 | # Save as JPEG 29 | img_path = os.path.join(output_dir, f"{image_name}.jpg") 30 | image.save(img_path, "JPEG", quality=95) 31 | return img_path 32 | except Exception as e: 33 | logger.error(f"Error saving image: {str(e)}") 34 | raise 35 | 36 | def extract(self, html_path: str, output_dir: str) -> str: 37 | """Process HTML file by converting to image.""" 38 | try: 39 | html_filename = os.path.splitext( 40 | os.path.basename(html_path) 41 | )[0] 42 | 43 | # Create temporary directory for page images 44 | pages_dir = os.path.join( 45 | output_dir, f"{html_filename}_pages" 46 | ) 47 | if not os.path.exists(pages_dir): 48 | os.makedirs(pages_dir) 49 | 50 | # Read HTML file content 51 | with open(html_path, "r", encoding="utf-8") as f: 52 | html_content = f.read() 53 | 54 | # Create temporary HTML file with absolute paths 55 | with tempfile.NamedTemporaryFile( 56 | mode="w", suffix=".html", delete=False 57 | ) as temp_html: 58 | # Convert relative paths to absolute 59 | base_dir = os.path.dirname(os.path.abspath(html_path)) 60 | html_content = html_content.replace( 61 | 'src="', f'src="{base_dir}/' 62 | ) 63 | html_content = html_content.replace( 64 | "src='", f"src='{base_dir}/" 65 | ) 66 | temp_html.write(html_content) 67 | temp_path = temp_html.name 68 | 69 | try: 70 | # Capture webpage as image 71 | screenshot = capture_webpage( 72 | f"file://{temp_path}", DEFAULT_CONFIG 73 | ) 74 | 75 | # Save screenshot 76 | image_name = "page_1" 77 | img_path = self.save_image( 78 | screenshot, pages_dir, image_name 79 | ) 80 | 81 | # Get page description using configured model 82 | page_description = self.describe_image(img_path) 83 | 84 | # Generate markdown content 85 | md_content = f"# {html_filename}\n\n" 86 | md_content += "## Page 1\n\n" 87 | 
md_content += "[Image 1]\n" 88 | md_content += f"Description: {page_description}\n\n" 89 | 90 | # Save markdown file 91 | md_file_path = os.path.join( 92 | output_dir, f"{html_filename}_html.md" 93 | ) 94 | with open( 95 | md_file_path, "w", encoding="utf-8" 96 | ) as md_file: 97 | md_file.write(md_content) 98 | 99 | # Clean up image file 100 | os.remove(img_path) 101 | 102 | logger.info("Processing HTML file...") 103 | logger.info(f"Extracted content and saved to markdown") 104 | logger.info("HTML processing completed successfully") 105 | 106 | return md_file_path 107 | 108 | finally: 109 | # Clean up temporary HTML file 110 | os.remove(temp_path) 111 | # Clean up pages directory 112 | os.rmdir(pages_dir) 113 | 114 | except Exception as e: 115 | logger.error(f"Error processing HTML: {str(e)}") 116 | raise 117 | -------------------------------------------------------------------------------- /pyvisionai/extractors/pdf_page.py: -------------------------------------------------------------------------------- 1 | """PDF page-as-image extractor.""" 2 | 3 | import concurrent.futures 4 | import logging 5 | import os 6 | from typing import Tuple 7 | 8 | from pdf2image import convert_from_path 9 | from PIL import Image 10 | 11 | from pyvisionai.extractors.base import BaseExtractor 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class PDFPageImageExtractor(BaseExtractor): 17 | """Extract content from PDF files by converting pages to images.""" 18 | 19 | def convert_pages_to_images(self, pdf_path: str) -> list: 20 | """Convert PDF pages to images.""" 21 | return convert_from_path(pdf_path, dpi=300) 22 | 23 | def save_image( 24 | self, image: Image.Image, output_dir: str, image_name: str 25 | ) -> str: 26 | """Save an image to the output directory.""" 27 | img_path = os.path.join(output_dir, f"{image_name}.jpg") 28 | image.save(img_path, "JPEG", quality=95) 29 | return img_path 30 | 31 | def process_page( 32 | self, page_data: Tuple[int, Image.Image], pages_dir: str 33 | ) -> Tuple[int, str]: 34 | """Process a single page. 
35 | 36 | Args: 37 | page_data: Tuple of (page number, page image) 38 | pages_dir: Directory to save page images 39 | 40 | Returns: 41 | Tuple of (page number, page description) 42 | """ 43 | try: 44 | page_num, image = page_data 45 | # Save page image 46 | image_name = f"page_{page_num + 1}" 47 | img_path = self.save_image(image, pages_dir, image_name) 48 | 49 | # Get page description using configured model 50 | page_description = self.describe_image(img_path) 51 | 52 | # Clean up image file 53 | os.remove(img_path) 54 | 55 | return page_num, page_description 56 | except Exception as e: 57 | logger.error( 58 | f"Error processing page {page_num + 1}: {str(e)}" 59 | ) 60 | return ( 61 | page_num, 62 | f"Error: Could not process page {page_num + 1}", 63 | ) 64 | 65 | def extract(self, pdf_path: str, output_dir: str) -> str: 66 | """Process PDF file by converting each page to an image.""" 67 | try: 68 | pdf_filename = os.path.splitext(os.path.basename(pdf_path))[ 69 | 0 70 | ] 71 | 72 | # Create temporary directory for page images 73 | pages_dir = os.path.join( 74 | output_dir, f"{pdf_filename}_pages" 75 | ) 76 | if not os.path.exists(pages_dir): 77 | os.makedirs(pages_dir) 78 | 79 | logger.info("Processing PDF file...") 80 | # Convert PDF pages to images 81 | images = self.convert_pages_to_images(pdf_path) 82 | logger.info(f"Converting {len(images)} pages to images") 83 | 84 | # Generate markdown content 85 | md_content = f"# {pdf_filename}\n\n" 86 | 87 | # Process pages in parallel 88 | descriptions = [""] * len(images) 89 | with concurrent.futures.ThreadPoolExecutor( 90 | max_workers=4 91 | ) as executor: 92 | # Create page tasks 93 | page_tasks = [(i, img) for i, img in enumerate(images)] 94 | 95 | # Submit all tasks 96 | future_to_page = { 97 | executor.submit( 98 | self.process_page, task, pages_dir 99 | ): task[0] 100 | for task in page_tasks 101 | } 102 | 103 | # Collect results as they complete 104 | for future in concurrent.futures.as_completed( 105 | future_to_page 106 | ): 107 | page_num, description = future.result() 108 | descriptions[page_num] = description 109 | 110 | # Add descriptions to markdown in correct order 111 | for page_num, description in enumerate(descriptions): 112 | md_content += f"## Page {page_num + 1}\n\n" 113 | md_content += f"[Image {page_num + 1}]\n" 114 | md_content += f"Description: {description}\n\n" 115 | 116 | # Save markdown file 117 | md_file_path = os.path.join( 118 | output_dir, f"{pdf_filename}_pdf.md" 119 | ) 120 | with open(md_file_path, "w", encoding="utf-8") as md_file: 121 | md_file.write(md_content) 122 | 123 | # Clean up pages directory after all pages are processed 124 | os.rmdir(pages_dir) 125 | 126 | logger.info("PDF processing completed successfully") 127 | return md_file_path 128 | 129 | except Exception as e: 130 | logger.error(f"Error processing PDF: {str(e)}") 131 | raise 132 | -------------------------------------------------------------------------------- /pyvisionai/extractors/pptx.py: -------------------------------------------------------------------------------- 1 | """Extract text and images from PPTX files.""" 2 | 3 | import concurrent.futures 4 | import io 5 | import os 6 | from dataclasses import dataclass 7 | from typing import List, Tuple 8 | 9 | from PIL import Image 10 | from pptx import Presentation 11 | 12 | from pyvisionai.extractors.base import BaseExtractor 13 | from pyvisionai.utils.logger import logger 14 | 15 | 16 | @dataclass 17 | class ImageTask: 18 | """Container for image processing task data.""" 19 | 20 | 
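# NOTE: index records the image's position in the source document so
# that descriptions returned out of order by the thread pool can be
# written back in their original sequence.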
image_data: bytes 21 | image_name: str 22 | output_dir: str 23 | index: int 24 | 25 | 26 | class PptxTextImageExtractor(BaseExtractor): 27 | """Extract text and images from PPTX files.""" 28 | 29 | def extract_text_and_images( 30 | self, pptx_path: str 31 | ) -> Tuple[List[str], List[bytes]]: 32 | """Extract text and images from PPTX file.""" 33 | prs = Presentation(pptx_path) 34 | texts = [] 35 | images = [] 36 | 37 | # Extract text and images from slides 38 | for slide in prs.slides: 39 | # Extract text from shapes 40 | slide_text = [] 41 | for shape in slide.shapes: 42 | if hasattr(shape, "text") and shape.text.strip(): 43 | slide_text.append(shape.text) 44 | texts.append("\n".join(slide_text)) 45 | 46 | # Extract images from relationships 47 | for rel in slide.part.rels.values(): 48 | if "image" in rel.reltype: 49 | try: 50 | image_data = rel.target_part.blob 51 | images.append(image_data) 52 | except Exception as e: 53 | logger.error( 54 | f"Error extracting image: {str(e)}" 55 | ) 56 | continue 57 | 58 | return texts, images 59 | 60 | def save_image( 61 | self, image_data: bytes, output_dir: str, image_name: str 62 | ) -> str: 63 | """Save an image to the output directory.""" 64 | try: 65 | # Convert image data to PIL Image 66 | image = Image.open(io.BytesIO(image_data)) 67 | # Convert to RGB if necessary 68 | if image.mode != "RGB": 69 | image = image.convert("RGB") 70 | # Save as JPEG (supported format) 71 | img_path = os.path.join(output_dir, f"{image_name}.jpg") 72 | image.save(img_path, "JPEG", quality=95) 73 | return img_path 74 | except Exception as e: 75 | logger.error(f"Error saving image: {str(e)}") 76 | raise 77 | 78 | def process_image_task(self, task: ImageTask) -> tuple[int, str]: 79 | """Process a single image task.""" 80 | try: 81 | img_path = self.save_image( 82 | task.image_data, task.output_dir, task.image_name 83 | ) 84 | image_description = self.describe_image(img_path) 85 | os.remove(img_path) # Clean up 86 | return task.index, image_description 87 | except Exception as e: 88 | logger.error( 89 | f"Error processing image {task.image_name}: {str(e)}" 90 | ) 91 | return ( 92 | task.index, 93 | f"Error: Could not process image {task.image_name}", 94 | ) 95 | 96 | def extract(self, pptx_path: str, output_dir: str) -> str: 97 | """Process PPTX file by extracting text and images separately.""" 98 | try: 99 | pptx_filename = os.path.splitext( 100 | os.path.basename(pptx_path) 101 | )[0] 102 | 103 | # Extract text and images 104 | texts, images = self.extract_text_and_images(pptx_path) 105 | 106 | # Generate markdown content 107 | md_content = f"# {pptx_filename}\n\n" 108 | 109 | # Add text content 110 | for slide_num, text in enumerate(texts, 1): 111 | if text: 112 | md_content += f"## Slide {slide_num}\n\n" 113 | md_content += f"{text}\n\n" 114 | 115 | # Prepare image tasks 116 | image_tasks = [] 117 | for img_index, img_data in enumerate(images): 118 | image_name = f"{pptx_filename}_image_{img_index + 1}" 119 | task = ImageTask( 120 | image_data=img_data, 121 | image_name=image_name, 122 | output_dir=output_dir, 123 | index=img_index, 124 | ) 125 | image_tasks.append(task) 126 | 127 | # Process images in parallel if there are any 128 | if image_tasks: 129 | # Store descriptions in order 130 | descriptions = [""] * len(image_tasks) 131 | 132 | # Use ThreadPoolExecutor for parallel processing 133 | with concurrent.futures.ThreadPoolExecutor( 134 | max_workers=4 135 | ) as executor: 136 | # Submit all tasks 137 | future_to_task = { 138 | executor.submit( 139 | 
self.process_image_task, task 140 | ): task 141 | for task in image_tasks 142 | } 143 | 144 | # Collect results as they complete 145 | for future in concurrent.futures.as_completed( 146 | future_to_task 147 | ): 148 | idx, description = future.result() 149 | descriptions[idx] = description 150 | 151 | # Add descriptions to markdown in correct order 152 | for img_index, description in enumerate(descriptions): 153 | md_content += f"[Image {img_index + 1}]\n" 154 | md_content += f"Description: {description}\n\n" 155 | 156 | # Save markdown file 157 | md_file_path = os.path.join( 158 | output_dir, f"{pptx_filename}_pptx.md" 159 | ) 160 | with open(md_file_path, "w", encoding="utf-8") as md_file: 161 | md_file.write(md_content) 162 | 163 | # Add info logging 164 | logger.info("Processing PPTX file...") 165 | logger.info(f"Extracted {len(images)} images") 166 | logger.info("PPTX processing completed successfully") 167 | 168 | return md_file_path 169 | 170 | except Exception as e: 171 | logger.error(f"Error processing PPTX: {str(e)}") 172 | raise 173 | -------------------------------------------------------------------------------- /pyvisionai/extractors/pptx_page.py: -------------------------------------------------------------------------------- 1 | """PPTX page-as-image extractor. 2 | 3 | Converts PPTX files to images by first converting to PDF using LibreOffice, 4 | then converting PDF pages to images using pdf2image.""" 5 | 6 | import concurrent.futures 7 | import logging 8 | import os 9 | import subprocess 10 | import tempfile 11 | import time 12 | from dataclasses import dataclass 13 | 14 | from PIL import Image 15 | 16 | from pyvisionai.extractors.base import BaseExtractor 17 | 18 | logger = logging.getLogger(__name__) 19 | 20 | 21 | @dataclass 22 | class SlideTask: 23 | """Container for slide processing task data.""" 24 | 25 | image: Image.Image 26 | image_name: str 27 | output_dir: str 28 | index: int 29 | 30 | 31 | class PptxPageImageExtractor(BaseExtractor): 32 | """Extract content from PPTX files by converting slides to images. 33 | 34 | Uses LibreOffice to convert PPTX to PDF, then pdf2image to convert 35 | PDF pages to images. Each slide is processed in parallel to generate 36 | descriptions.""" 37 | 38 | def convert_to_pdf(self, pptx_path: str) -> str: 39 | """Convert PPTX to PDF using LibreOffice.""" 40 | try: 41 | # Create a temporary directory for the PDF 42 | temp_dir = tempfile.mkdtemp() 43 | 44 | # Get absolute paths 45 | abs_pptx_path = os.path.abspath(pptx_path) 46 | abs_temp_dir = os.path.abspath(temp_dir) 47 | 48 | # The output PDF will have the same name as the input PPTX 49 | pptx_filename = os.path.splitext( 50 | os.path.basename(pptx_path) 51 | )[0] 52 | pdf_path = os.path.join( 53 | abs_temp_dir, f"{pptx_filename}.pdf" 54 | ) 55 | 56 | # Convert PPTX to PDF using LibreOffice 57 | cmd = [ 58 | "soffice", 59 | "--headless", 60 | "--convert-to", 61 | "pdf", 62 | "--outdir", 63 | abs_temp_dir, 64 | abs_pptx_path, 65 | ] 66 | 67 | # Run the command and capture output 68 | result = subprocess.run( 69 | cmd, check=True, capture_output=True, text=True 70 | ) 71 | 72 | # Wait a moment for the file to be written 73 | time.sleep(1) 74 | 75 | # Verify the PDF was created 76 | if not os.path.exists(pdf_path): 77 | raise FileNotFoundError( 78 | f"PDF file was not created. 
LibreOffice output:\n" 79 | f"STDOUT: {result.stdout}\n" 80 | f"STDERR: {result.stderr}" 81 | ) 82 | 83 | return pdf_path 84 | 85 | except subprocess.CalledProcessError as e: 86 | raise RuntimeError( 87 | f"LibreOffice conversion failed:\n" 88 | f"STDOUT: {e.stdout}\n" 89 | f"STDERR: {e.stderr}" 90 | ) from e 91 | except Exception as e: 92 | raise RuntimeError( 93 | f"Failed to convert PPTX to PDF: {str(e)}" 94 | ) from e 95 | 96 | def convert_pages_to_images(self, pdf_path: str) -> list: 97 | """Convert PDF pages to images using pdf2image.""" 98 | from pdf2image import convert_from_path 99 | 100 | return convert_from_path(pdf_path, dpi=300) 101 | 102 | def save_image( 103 | self, image: Image.Image, output_dir: str, image_name: str 104 | ) -> str: 105 | """Save an image to the output directory.""" 106 | img_path = os.path.join(output_dir, f"{image_name}.jpg") 107 | image.save(img_path, "JPEG", quality=95) 108 | return img_path 109 | 110 | def process_slide(self, task: SlideTask) -> tuple[int, str]: 111 | """Process a single slide. 112 | 113 | Saves the slide as an image, generates a description using the configured 114 | model, then cleans up the image file. 115 | 116 | Args: 117 | task: SlideTask containing the slide image and processing details 118 | 119 | Returns: 120 | Tuple of (slide index, description) 121 | """ 122 | try: 123 | # Save slide image 124 | img_path = self.save_image( 125 | task.image, task.output_dir, task.image_name 126 | ) 127 | 128 | # Get slide description using configured model 129 | slide_description = self.describe_image(img_path) 130 | 131 | # Clean up image file 132 | os.remove(img_path) 133 | 134 | return task.index, slide_description 135 | except Exception as e: 136 | logger.error( 137 | f"Error processing slide {task.image_name}: {str(e)}" 138 | ) 139 | return ( 140 | task.index, 141 | f"Error: Could not process slide {task.image_name}", 142 | ) 143 | 144 | def extract(self, pptx_path: str, output_dir: str) -> str: 145 | """Process PPTX file by converting each slide to an image.""" 146 | try: 147 | logger.info("Processing PPTX file...") 148 | 149 | pptx_filename = os.path.splitext( 150 | os.path.basename(pptx_path) 151 | )[0] 152 | 153 | # Create temporary directory for slide images 154 | slides_dir = os.path.join( 155 | output_dir, f"{pptx_filename}_slides" 156 | ) 157 | if not os.path.exists(slides_dir): 158 | os.makedirs(slides_dir) 159 | 160 | # Convert PPTX to PDF first 161 | pdf_path = self.convert_to_pdf(pptx_path) 162 | logger.info("Converted PPTX to PDF") 163 | 164 | # Convert PDF pages to images 165 | images = self.convert_pages_to_images(pdf_path) 166 | logger.info(f"Converting {len(images)} slides to images") 167 | 168 | # Generate markdown content 169 | md_content = f"# {pptx_filename}\n\n" 170 | 171 | # Create slide tasks 172 | slide_tasks = [] 173 | for slide_num, image in enumerate(images): 174 | image_name = f"slide_{slide_num + 1}" 175 | task = SlideTask( 176 | image=image, 177 | image_name=image_name, 178 | output_dir=slides_dir, 179 | index=slide_num, 180 | ) 181 | slide_tasks.append(task) 182 | 183 | # Process slides in parallel 184 | descriptions = [""] * len(slide_tasks) 185 | with concurrent.futures.ThreadPoolExecutor( 186 | max_workers=4 187 | ) as executor: 188 | # Submit all tasks 189 | future_to_task = { 190 | executor.submit(self.process_slide, task): task 191 | for task in slide_tasks 192 | } 193 | 194 | # Collect results as they complete 195 | for future in concurrent.futures.as_completed( 196 | future_to_task 197 | ): 198 | 
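# NOTE: as_completed() yields futures in completion order, not
# submission order, which is why each result is written back into
# descriptions by slide index rather than appended.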
idx, description = future.result() 199 | descriptions[idx] = description 200 | 201 | # Add descriptions to markdown in correct order 202 | for slide_num, description in enumerate(descriptions): 203 | md_content += f"## Slide {slide_num + 1}\n\n" 204 | md_content += f"[Image {slide_num + 1}]\n" 205 | md_content += f"Description: {description}\n\n" 206 | 207 | # Save markdown file 208 | md_file_path = os.path.join( 209 | output_dir, f"{pptx_filename}_pptx.md" 210 | ) 211 | with open(md_file_path, "w", encoding="utf-8") as md_file: 212 | md_file.write(md_content) 213 | 214 | # Clean up temporary files 215 | os.remove(pdf_path) 216 | os.rmdir( 217 | os.path.dirname(pdf_path) 218 | ) # Remove temp PDF directory 219 | os.rmdir(slides_dir) # Remove slides directory 220 | 221 | logger.info("PPTX processing completed successfully") 222 | return md_file_path 223 | 224 | except Exception as e: 225 | logger.error(f"Error processing PPTX: {str(e)}") 226 | raise 227 | -------------------------------------------------------------------------------- /pyvisionai/utils/__init__.py: -------------------------------------------------------------------------------- 1 | """Utility functions and helpers.""" 2 | 3 | from .config import ( 4 | CONTENT_DIR, 5 | DEFAULT_IMAGE_MODEL, 6 | DEFAULT_PDF_EXTRACTOR, 7 | EXTRACTED_DIR, 8 | LOG_DIR, 9 | OLLAMA_HOST, 10 | OPENAI_API_KEY, 11 | SOURCE_DIR, 12 | ) 13 | from .logger import logger, setup_logger 14 | from .retry import ( 15 | APIError, 16 | ConnectionError, 17 | RateLimitError, 18 | RetryableError, 19 | RetryManager, 20 | RetryStrategy, 21 | TemporaryError, 22 | ) 23 | 24 | __all__ = [ 25 | "logger", 26 | "setup_logger", 27 | "RetryManager", 28 | "RetryStrategy", 29 | "RetryableError", 30 | "APIError", 31 | "RateLimitError", 32 | "TemporaryError", 33 | "ConnectionError", 34 | "DEFAULT_PDF_EXTRACTOR", 35 | "DEFAULT_IMAGE_MODEL", 36 | "OPENAI_API_KEY", 37 | "OLLAMA_HOST", 38 | "CONTENT_DIR", 39 | "SOURCE_DIR", 40 | "EXTRACTED_DIR", 41 | "LOG_DIR", 42 | ] 43 | -------------------------------------------------------------------------------- /pyvisionai/utils/benchmark.py: -------------------------------------------------------------------------------- 1 | """Benchmark management utilities.""" 2 | 3 | import json 4 | import logging 5 | from dataclasses import asdict, dataclass 6 | from datetime import datetime 7 | from pathlib import Path 8 | from typing import Dict, Optional, Union 9 | 10 | from filelock import FileLock 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | @dataclass 16 | class BenchmarkMetrics: 17 | """Container for benchmark metrics.""" 18 | 19 | interface: str 20 | output_size: int 21 | cli_time: Optional[float] = None 22 | setup_time: Optional[float] = None 23 | extraction_time: Optional[float] = None 24 | 25 | def validate(self) -> None: 26 | """Validate metric values.""" 27 | if ( 28 | not isinstance(self.output_size, int) 29 | or self.output_size < 0 30 | ): 31 | raise ValueError( 32 | "output_size must be a non-negative integer" 33 | ) 34 | 35 | if self.interface not in ["cli", "api"]: 36 | raise ValueError("interface must be either 'cli' or 'api'") 37 | 38 | if self.interface == "cli" and self.cli_time is None: 39 | raise ValueError("cli_time is required for CLI interface") 40 | 41 | if self.interface == "api" and not ( 42 | self.cli_time or self.extraction_time 43 | ): 44 | raise ValueError( 45 | "Either cli_time or extraction_time is required for API interface" 46 | ) 47 | 48 | 49 | @dataclass 50 | class BenchmarkEntry: 51 | 
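# NOTE: each entry pairs test metadata (file type, method, timestamp)
# with normalized metrics; BenchmarkLogger.log below appends entries to
# benchmark.log as one JSON object per line.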
"""Container for benchmark data.""" 52 | 53 | test: Dict[str, str] 54 | metrics: BenchmarkMetrics 55 | 56 | @classmethod 57 | def from_dict(cls, data: Dict) -> 'BenchmarkEntry': 58 | """Create BenchmarkEntry from dictionary.""" 59 | return cls( 60 | test=data["test"], 61 | metrics=BenchmarkMetrics(**data["metrics"]), 62 | ) 63 | 64 | 65 | class MetricNormalizer: 66 | """Handles normalization of benchmark metrics.""" 67 | 68 | @staticmethod 69 | def normalize(metrics: Dict) -> BenchmarkMetrics: 70 | """Normalize metrics to standard format. 71 | 72 | For CLI interface: 73 | - Uses cli_time as the primary timing metric 74 | For API interface: 75 | - Uses extraction_time if available, falls back to cli_time 76 | - setup_time is optional and defaults to 0 77 | """ 78 | if metrics.get("interface") == "cli": 79 | return BenchmarkMetrics( 80 | interface="cli", 81 | output_size=metrics["output_size"], 82 | cli_time=metrics.get("cli_time") 83 | or metrics.get("total_time"), 84 | ) 85 | else: 86 | # For API, use extraction_time if available, otherwise fall back to cli_time 87 | setup_time = metrics.get("setup_time", 0) 88 | total_time = metrics.get("cli_time", 0) 89 | extraction_time = metrics.get("extraction_time", total_time) 90 | 91 | return BenchmarkMetrics( 92 | interface="api", 93 | output_size=metrics["output_size"], 94 | setup_time=setup_time, 95 | extraction_time=extraction_time, 96 | cli_time=total_time or (setup_time + extraction_time), 97 | ) 98 | 99 | 100 | class BenchmarkLogger: 101 | """Handles benchmark logging with file locking.""" 102 | 103 | def __init__(self, log_dir: Union[str, Path]): 104 | self.log_dir = Path(log_dir) 105 | self.log_file = self.log_dir / "benchmark.log" 106 | self._setup_logging() 107 | 108 | def _setup_logging(self): 109 | """Set up logging directory.""" 110 | self.log_dir.mkdir(parents=True, exist_ok=True) 111 | 112 | def log(self, file_type: str, method: str, metrics: Dict) -> None: 113 | """Log benchmark entry with file locking.""" 114 | try: 115 | # Normalize metrics 116 | normalized_metrics = MetricNormalizer.normalize(metrics) 117 | normalized_metrics.validate() 118 | 119 | # Create entry 120 | entry = BenchmarkEntry( 121 | test={ 122 | "file_type": file_type, 123 | "method": method, 124 | "timestamp": datetime.now().isoformat(), 125 | }, 126 | metrics=normalized_metrics, 127 | ) 128 | 129 | # Write to log file with lock 130 | lock_file = str(self.log_file) + ".lock" 131 | with FileLock(lock_file): 132 | with open(self.log_file, "a") as f: 133 | f.write(json.dumps(asdict(entry)) + "\n") 134 | 135 | # Log to console 136 | logger.info( 137 | f"Benchmark - {file_type} ({method}): " 138 | f"CLI Time: {normalized_metrics.cli_time:.2f}s, " 139 | f"Output Size: {normalized_metrics.output_size} bytes" 140 | ) 141 | 142 | except Exception as e: 143 | logger.error(f"Failed to log benchmark: {str(e)}") 144 | raise 145 | -------------------------------------------------------------------------------- /pyvisionai/utils/config.py: -------------------------------------------------------------------------------- 1 | """Configuration constants for PyVisionAI.""" 2 | 3 | import os 4 | 5 | # Default directories 6 | CONTENT_DIR = "content" 7 | SOURCE_DIR = os.path.join(CONTENT_DIR, "source") 8 | EXTRACTED_DIR = os.path.join(CONTENT_DIR, "extracted") 9 | LOG_DIR = os.path.join(CONTENT_DIR, "log") 10 | 11 | # Default settings 12 | DEFAULT_IMAGE_MODEL = "gpt4" # Default to GPT-4 for best results 13 | DEFAULT_PDF_EXTRACTOR = "page_as_image" # or "text_and_images" 14 | 15 | # 
Model names 16 | OLLAMA_MODEL_NAME = "llama3.2-vision" # Default Ollama model 17 | OPENAI_MODEL_NAME = "gpt-4o-mini" # Default OpenAI model 18 | 19 | # Default prompts for image description 20 | DEFAULT_PROMPT = ( 21 | "Describe this image in detail. Preserve as much of the precise " 22 | "original text, format, images and style as possible." 23 | ) 24 | 25 | # API keys 26 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 27 | OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://localhost:11434") 28 | 29 | # Create directories if they don't exist 30 | for directory in [CONTENT_DIR, SOURCE_DIR, EXTRACTED_DIR, LOG_DIR]: 31 | os.makedirs(directory, exist_ok=True) 32 | -------------------------------------------------------------------------------- /pyvisionai/utils/logger.py: -------------------------------------------------------------------------------- 1 | """Logging configuration for the pyvisionai package.""" 2 | 3 | import logging 4 | import os 5 | from datetime import datetime 6 | from pathlib import Path 7 | 8 | 9 | def setup_logger( 10 | name: str = "pyvisionai", log_dir: str | Path | None = None 11 | ) -> logging.Logger: 12 | """ 13 | Set up a logger with file and console handlers. 14 | 15 | Args: 16 | name: Name for the logger (default: pyvisionai) 17 | log_dir: Directory for log files (optional) 18 | 19 | Returns: 20 | logging.Logger: Configured logger instance 21 | """ 22 | logger = logging.getLogger(name) 23 | logger.setLevel(logging.INFO) 24 | 25 | # Remove any existing handlers 26 | for handler in logger.handlers[:]: 27 | logger.removeHandler(handler) 28 | 29 | # Create formatters 30 | console_formatter = logging.Formatter("%(message)s") 31 | file_formatter = logging.Formatter( 32 | "%(asctime)s - %(levelname)s - %(message)s" 33 | ) 34 | 35 | # Console handler 36 | console_handler = logging.StreamHandler() 37 | console_handler.setLevel(logging.INFO) 38 | console_handler.setFormatter(console_formatter) 39 | logger.addHandler(console_handler) 40 | 41 | # File handler (if log_dir is provided) 42 | if log_dir is not None: 43 | log_dir = Path(log_dir) 44 | log_dir.mkdir(parents=True, exist_ok=True) 45 | log_file = log_dir / f"{name.replace('.', '_')}.log" 46 | file_handler = logging.FileHandler(str(log_file)) 47 | file_handler.setLevel(logging.INFO) 48 | file_handler.setFormatter(file_formatter) 49 | logger.addHandler(file_handler) 50 | 51 | return logger 52 | 53 | 54 | # Create the default logger instance 55 | logger = setup_logger() 56 | -------------------------------------------------------------------------------- /pyvisionai/utils/retry.py: -------------------------------------------------------------------------------- 1 | """Retry mechanism for handling transient failures.""" 2 | 3 | import logging 4 | import time 5 | from enum import Enum 6 | from typing import Callable, Optional, TypeVar 7 | 8 | import requests 9 | 10 | T = TypeVar('T') 11 | 12 | 13 | class RetryStrategy(Enum): 14 | """Available retry delay strategies.""" 15 | 16 | EXPONENTIAL = "exponential" # Exponential backoff 17 | LINEAR = "linear" # Linear backoff 18 | CONSTANT = "constant" # Constant delay 19 | 20 | 21 | class RetryableError(Exception): 22 | """Base class for errors that should trigger retry.""" 23 | 24 | pass 25 | 26 | 27 | class APIError(RetryableError): 28 | """Base class for API-related errors.""" 29 | 30 | pass 31 | 32 | 33 | class RateLimitError(APIError): 34 | """Error raised when API rate limit is exceeded.""" 35 | 36 | pass 37 | 38 | 39 | class TemporaryError(APIError): 40 | """Error raised for 
temporary API issues (5xx errors).""" 41 | 42 | pass 43 | 44 | 45 | class ConnectionError(APIError): 46 | """Error raised for network connectivity issues.""" 47 | 48 | pass 49 | 50 | 51 | def is_retryable_http_error(e: Exception) -> bool: 52 | """ 53 | Check if an HTTP error should trigger a retry. 54 | 55 | Args: 56 | e: The exception to check 57 | 58 | Returns: 59 | bool: True if the error should trigger a retry 60 | """ 61 | # Handle Anthropic errors 62 | if e.__class__.__name__ == "APIError": 63 | error_msg = str(e).lower() 64 | return ( 65 | "rate limit" in error_msg 66 | or "server error" in error_msg 67 | or "overloaded" in error_msg 68 | or "529" in error_msg 69 | ) 70 | 71 | # Handle OpenAI errors 72 | if e.__class__.__name__ == "OpenAIError": 73 | error_msg = str(e).lower() 74 | return "rate limit" in error_msg or "server error" in error_msg 75 | 76 | if isinstance(e, requests.exceptions.RequestException): 77 | if isinstance(e, requests.exceptions.HTTPError): 78 | # Retry on rate limits (429), server errors (5xx), and overloaded (529) 79 | return e.response.status_code in [429, 529] + list( 80 | range(500, 600) 81 | ) 82 | # Retry on connection errors, timeouts etc. 83 | return isinstance( 84 | e, 85 | ( 86 | requests.exceptions.ConnectionError, 87 | requests.exceptions.Timeout, 88 | ), 89 | ) 90 | return False 91 | 92 | 93 | def convert_error(e: Exception) -> Exception: 94 | """Convert API errors to appropriate retry errors.""" 95 | error_msg = str(e).lower() 96 | if "rate limit" in error_msg: 97 | return ConnectionError("Rate limit exceeded") 98 | elif "server error" in error_msg: 99 | return ConnectionError("Internal server error") 100 | elif "overloaded" in error_msg or "529" in error_msg: 101 | return ConnectionError("Service overloaded") 102 | return e 103 | 104 | 105 | class RetryManager: 106 | """Manages retry logic for operations that may fail transiently.""" 107 | 108 | def __init__( 109 | self, 110 | max_attempts: int = 3, 111 | strategy: RetryStrategy = RetryStrategy.EXPONENTIAL, 112 | base_delay: float = 1.0, 113 | max_delay: float = 30.0, 114 | logger: Optional[logging.Logger] = None, 115 | ): 116 | """ 117 | Initialize the retry manager. 118 | 119 | Args: 120 | max_attempts: Maximum number of attempts (including first try) 121 | strategy: Retry delay strategy to use 122 | base_delay: Base delay between retries in seconds 123 | max_delay: Maximum delay between retries in seconds 124 | logger: Logger instance to use (creates new if None) 125 | """ 126 | if max_attempts < 1: 127 | raise ValueError("max_attempts must be >= 1") 128 | if base_delay <= 0: 129 | raise ValueError("base_delay must be > 0") 130 | if max_delay < base_delay: 131 | raise ValueError("max_delay must be >= base_delay") 132 | 133 | self.max_attempts = max_attempts 134 | self.strategy = strategy 135 | self.base_delay = base_delay 136 | self.max_delay = max_delay 137 | self.logger = logger or logging.getLogger(__name__) 138 | 139 | def execute(self, operation: Callable[[], T]) -> T: 140 | """ 141 | Execute an operation with retry logic. 
142 | 143 | Args: 144 | operation: Callable that may raise RetryableError 145 | 146 | Returns: 147 | The result of the operation if successful 148 | 149 | Raises: 150 | RetryableError: If max retries exceeded 151 | Exception: Any non-retryable error from operation 152 | """ 153 | last_error = None 154 | 155 | for attempt in range(self.max_attempts): 156 | try: 157 | return operation() 158 | except Exception as e: 159 | # Convert HTTP errors to retryable errors 160 | if isinstance(e, requests.exceptions.ConnectionError): 161 | error = ConnectionError(str(e)) 162 | elif is_retryable_http_error(e): 163 | error = convert_error(e) 164 | elif not isinstance(e, RetryableError): 165 | raise 166 | else: 167 | error = e 168 | 169 | last_error = error 170 | if attempt + 1 < self.max_attempts: 171 | delay = self._calculate_delay(attempt) 172 | self.logger.warning( 173 | f"Attempt {attempt + 1} failed: {str(error)}. " 174 | f"Retrying in {delay:.1f}s" 175 | ) 176 | time.sleep(delay) 177 | continue 178 | 179 | # Re-raise the last error 180 | raise last_error 181 | 182 | def _calculate_delay(self, attempt: int) -> float: 183 | """ 184 | Calculate delay for next retry based on strategy. 185 | 186 | Args: 187 | attempt: Current attempt number (0-based) 188 | 189 | Returns: 190 | float: Delay in seconds 191 | """ 192 | if self.strategy == RetryStrategy.EXPONENTIAL: 193 | delay = self.base_delay * (2**attempt) 194 | elif self.strategy == RetryStrategy.LINEAR: 195 | delay = self.base_delay * (attempt + 1) 196 | else: # CONSTANT 197 | delay = self.base_delay 198 | 199 | return min(delay, self.max_delay) 200 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MDGrey33/pyvisionai/da23f86e12e6576f5f65bf39f32f424896d853b3/tests/__init__.py -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """Test configuration and shared fixtures.""" 2 | 3 | import json 4 | import logging 5 | import os 6 | import shutil 7 | from datetime import datetime 8 | from pathlib import Path 9 | from unittest.mock import patch 10 | 11 | import pytest 12 | 13 | from pyvisionai.utils.benchmark import BenchmarkLogger 14 | from pyvisionai.utils.logger import setup_logger 15 | 16 | 17 | # Configure logging 18 | def configure_test_logging(): 19 | """Configure logging to suppress verbose output.""" 20 | logging.getLogger("httpcore").setLevel(logging.ERROR) 21 | logging.getLogger("httpx").setLevel(logging.ERROR) 22 | logging.getLogger("openai").setLevel(logging.ERROR) 23 | logging.getLogger("anthropic").setLevel(logging.ERROR) 24 | 25 | # Disable propagation for these loggers 26 | for logger_name in ["httpcore", "httpx", "openai", "anthropic"]: 27 | logger = logging.getLogger(logger_name) 28 | logger.propagate = False 29 | 30 | 31 | logger = logging.getLogger(__name__) 32 | configure_test_logging() 33 | 34 | # Test data for file extraction 35 | testdata_file_extraction = [ 36 | ("pdf", "page_as_image"), 37 | ("pdf", "text_and_images"), 38 | ("docx", "page_as_image"), 39 | ("docx", "text_and_images"), 40 | ("pptx", "page_as_image"), 41 | ("pptx", "text_and_images"), 42 | ] 43 | 44 | ids_file_extraction = [ 45 | f"{filetype}-{method}" 46 | for filetype, method in testdata_file_extraction 47 | ] 48 | 49 | 50 | def copy_test_files(source_dir): 51 | """Copy 
test files to the test environment.""" 52 | test_files = { 53 | "pdf": "test.pdf", 54 | "docx": "test.docx", 55 | "pptx": "test.pptx", 56 | "html": "test.html", 57 | } 58 | 59 | for _, filename in test_files.items(): 60 | src = os.path.join("content", "test", "source", filename) 61 | dst = os.path.join(source_dir, filename) 62 | if os.path.exists(src): 63 | shutil.copy2(src, dst) 64 | else: 65 | # Create a simple HTML file for testing if it doesn't exist 66 | if filename.endswith(".html"): 67 | with open(dst, "w") as f: 68 | f.write( 69 | "
<html><body><h1>Test HTML</h1></body></html>
" 70 | ) 71 | else: 72 | raise FileNotFoundError(f"Test file not found: {src}") 73 | 74 | 75 | @pytest.fixture 76 | def benchmark_log_file(tmp_path): 77 | """Create a temporary benchmark log file.""" 78 | log_dir = tmp_path / "logs" 79 | log_dir.mkdir() 80 | return log_dir / "benchmark.log" 81 | 82 | 83 | @pytest.fixture 84 | def benchmark_logger(benchmark_log_file): 85 | """Create a benchmark logger with a temporary file.""" 86 | logger = BenchmarkLogger(log_dir=benchmark_log_file.parent) 87 | logger.logger = logging.getLogger("benchmark") 88 | logger.logger.setLevel(logging.INFO) 89 | return logger 90 | 91 | 92 | def log_benchmark(file_type, method, metrics, log_dir=None): 93 | """Log benchmark results using the benchmark logger. 94 | 95 | Args: 96 | file_type: Type of file being processed 97 | method: Extraction method used 98 | metrics: Dictionary containing benchmark metrics 99 | log_dir: Optional directory for log file (default: content/log) 100 | """ 101 | logger = BenchmarkLogger(log_dir or "content/log") 102 | logger.log(file_type, method, metrics) 103 | 104 | 105 | @pytest.fixture(autouse=True) 106 | def clean_benchmark_logs(): 107 | """Clean up benchmark logs after each test.""" 108 | yield 109 | log_file = Path("content/log/benchmark.log") 110 | lock_file = Path("content/log/benchmark.log.lock") 111 | if log_file.exists(): 112 | log_file.unlink() 113 | if lock_file.exists(): 114 | lock_file.unlink() 115 | 116 | 117 | def pytest_configure(config): 118 | """Register custom markers.""" 119 | config.addinivalue_line("markers", "integration: integration tests") 120 | config.addinivalue_line( 121 | "markers", "cli: command line interface tests" 122 | ) 123 | config.addinivalue_line( 124 | "markers", "openai: tests requiring OpenAI API" 125 | ) 126 | config.addinivalue_line( 127 | "markers", "claude: tests requiring Claude API" 128 | ) 129 | config.addinivalue_line("markers", "ollama: tests requiring Ollama") 130 | 131 | 132 | @pytest.fixture(autouse=True) 133 | def mock_sleep(): 134 | """Mock time.sleep globally to speed up tests.""" 135 | with patch('time.sleep'): 136 | yield 137 | 138 | 139 | @pytest.fixture(autouse=True) 140 | def skip_by_api_key(request): 141 | """Skip tests if required API key is missing.""" 142 | if request.node.get_closest_marker('openai'): 143 | if not os.getenv('OPENAI_API_KEY'): 144 | pytest.skip('OpenAI API key missing') 145 | elif request.node.get_closest_marker('claude'): 146 | if not os.getenv('ANTHROPIC_API_KEY'): 147 | pytest.skip('Anthropic API key missing') 148 | 149 | 150 | @pytest.fixture 151 | def test_image_path(): 152 | """Provide path to test image.""" 153 | return str(Path("content") / "test" / "source" / "test.jpeg") 154 | 155 | 156 | @pytest.fixture 157 | def mock_api_response(): 158 | """Provide mock API response.""" 159 | return {"text": "A forest scene with tall trees and green foliage."} 160 | 161 | 162 | @pytest.fixture 163 | def setup_test_env(tmp_path): 164 | """Set up test environment with required directories.""" 165 | content_dir = tmp_path / "content" 166 | source_dir = content_dir / "source" 167 | extracted_dir = content_dir / "extracted" 168 | log_dir = content_dir / "logs" 169 | 170 | # Create directories 171 | for dir_path in [content_dir, source_dir, extracted_dir, log_dir]: 172 | dir_path.mkdir(exist_ok=True) 173 | 174 | # Copy test files 175 | copy_test_files(str(source_dir)) 176 | 177 | # Set up logging 178 | setup_logger("pyvisionai.test", log_dir=log_dir) 179 | 180 | return { 181 | "content_dir": str(content_dir), 182 | 
"source_dir": str(source_dir), 183 | "extracted_dir": str(extracted_dir), 184 | "log_dir": str(log_dir), 185 | } 186 | -------------------------------------------------------------------------------- /tests/core/test_extractor.py: -------------------------------------------------------------------------------- 1 | """Tests for the BaseExtractor class.""" 2 | 3 | import pytest 4 | 5 | from pyvisionai.core.extractor import BaseExtractor 6 | 7 | 8 | def test_cannot_instantiate_base_extractor(): 9 | """Test that BaseExtractor cannot be instantiated directly.""" 10 | with pytest.raises( 11 | TypeError, match=r"Can't instantiate abstract class" 12 | ): 13 | BaseExtractor() 14 | 15 | 16 | def test_must_implement_extract(): 17 | """Test that concrete classes must implement the extract method.""" 18 | 19 | class IncompleteExtractor(BaseExtractor): 20 | pass 21 | 22 | with pytest.raises( 23 | TypeError, match=r"Can't instantiate abstract class" 24 | ): 25 | IncompleteExtractor() 26 | 27 | 28 | def test_concrete_implementation(): 29 | """Test that a concrete implementation with extract method can be instantiated.""" 30 | 31 | class ConcreteExtractor(BaseExtractor): 32 | def extract(self, file_path: str, output_dir: str) -> str: 33 | return "test.md" 34 | 35 | extractor = ConcreteExtractor() 36 | assert isinstance(extractor, BaseExtractor) 37 | result = extractor.extract("test.pdf", "output/") 38 | assert isinstance(result, str) 39 | assert result == "test.md" 40 | 41 | 42 | def test_extract_method_interface(): 43 | """Test that the extract method follows the expected interface.""" 44 | 45 | class TestExtractor(BaseExtractor): 46 | def extract(self, file_path: str, output_dir: str) -> str: 47 | assert isinstance( 48 | file_path, str 49 | ), "file_path must be a string" 50 | assert isinstance( 51 | output_dir, str 52 | ), "output_dir must be a string" 53 | return "test.md" 54 | 55 | extractor = TestExtractor() 56 | result = extractor.extract("test.pdf", "output/") 57 | assert isinstance(result, str) 58 | 59 | 60 | def test_extract_method_documentation(): 61 | """Test that the extract method has proper documentation.""" 62 | assert BaseExtractor.extract.__doc__ is not None 63 | assert "file_path" in BaseExtractor.extract.__doc__ 64 | assert "output_dir" in BaseExtractor.extract.__doc__ 65 | assert "Returns" in BaseExtractor.extract.__doc__ 66 | -------------------------------------------------------------------------------- /tests/data/chart.png: -------------------------------------------------------------------------------- 1 | Chart image content -------------------------------------------------------------------------------- /tests/data/charts.pptx: -------------------------------------------------------------------------------- 1 | Business presentation content 2 | -------------------------------------------------------------------------------- /tests/data/report.docx: -------------------------------------------------------------------------------- 1 | Report with tables 2 | -------------------------------------------------------------------------------- /tests/data/sample.docx: -------------------------------------------------------------------------------- 1 | Sample DOCX content 2 | -------------------------------------------------------------------------------- /tests/data/sample.pdf: -------------------------------------------------------------------------------- 1 | Sample PDF content -------------------------------------------------------------------------------- /tests/data/sample.pptx: 
-------------------------------------------------------------------------------- 1 | Sample PPTX content 2 | -------------------------------------------------------------------------------- /tests/data/sample_image.jpg: -------------------------------------------------------------------------------- 1 | Sample image content -------------------------------------------------------------------------------- /tests/data/technical_doc.pdf: -------------------------------------------------------------------------------- 1 | Technical documentation content -------------------------------------------------------------------------------- /tests/describers/test_api_retry.py: -------------------------------------------------------------------------------- 1 | """Tests for API retry behavior.""" 2 | 3 | from unittest.mock import MagicMock, patch 4 | 5 | import pytest 6 | import requests 7 | from openai import OpenAIError 8 | 9 | from pyvisionai.describers.ollama import LlamaVisionModel 10 | from pyvisionai.describers.openai import GPT4VisionModel 11 | from pyvisionai.utils.retry import ConnectionError 12 | 13 | 14 | @pytest.fixture 15 | def mock_image_data(): 16 | """Mock image data for testing.""" 17 | return b"mock_image_bytes" 18 | 19 | 20 | @pytest.fixture 21 | def llama_model(): 22 | """Create a LlamaVisionModel instance.""" 23 | return LlamaVisionModel() 24 | 25 | 26 | @pytest.fixture 27 | def gpt4_model(): 28 | """Create a GPT4VisionModel instance.""" 29 | return GPT4VisionModel(api_key="test_key") 30 | 31 | 32 | def test_ollama_retry_connection_error(llama_model, mock_image_data): 33 | """Test retry on Ollama connection error.""" 34 | with ( 35 | patch("requests.post") as mock_post, 36 | patch("builtins.open", create=True) as mock_open, 37 | ): 38 | # Mock file reading 39 | mock_file = MagicMock() 40 | mock_file.read.return_value = mock_image_data 41 | mock_open.return_value.__enter__.return_value = mock_file 42 | 43 | # Mock connection error twice, then success 44 | mock_post.side_effect = [ 45 | requests.exceptions.ConnectionError("Connection refused"), 46 | requests.exceptions.ConnectionError("Connection refused"), 47 | MagicMock( 48 | json=lambda: {"response": "Success response"}, 49 | raise_for_status=lambda: None, 50 | ), 51 | ] 52 | 53 | result = llama_model.describe_image("test.jpg") 54 | assert result == "Success response" 55 | assert mock_post.call_count == 3 56 | 57 | 58 | def test_ollama_retry_rate_limit(llama_model, mock_image_data): 59 | """Test retry on Ollama rate limit.""" 60 | with ( 61 | patch("requests.post") as mock_post, 62 | patch("builtins.open", create=True) as mock_open, 63 | ): 64 | # Mock file reading 65 | mock_file = MagicMock() 66 | mock_file.read.return_value = mock_image_data 67 | mock_open.return_value.__enter__.return_value = mock_file 68 | 69 | # Create mock responses 70 | error_response = MagicMock() 71 | error_response.raise_for_status.side_effect = ( 72 | requests.exceptions.HTTPError( 73 | response=MagicMock(status_code=429) 74 | ) 75 | ) 76 | success_response = MagicMock( 77 | json=lambda: {"response": "Success response"}, 78 | raise_for_status=lambda: None, 79 | ) 80 | 81 | # Mock rate limit twice, then success 82 | mock_post.side_effect = [ 83 | error_response, 84 | error_response, 85 | success_response, 86 | ] 87 | 88 | result = llama_model.describe_image("test.jpg") 89 | assert result == "Success response" 90 | assert mock_post.call_count == 3 91 | 92 | 93 | def test_gpt4_retry_rate_limit(gpt4_model, mock_image_data): 94 | """Test retry on OpenAI rate 
limit.""" 95 | with ( 96 | patch("builtins.open", create=True) as mock_open, 97 | patch( 98 | "pyvisionai.describers.openai.OpenAI" 99 | ) as mock_openai_class, 100 | ): 101 | # Mock file reading 102 | mock_file = MagicMock() 103 | mock_file.read.return_value = mock_image_data 104 | mock_open.return_value.__enter__.return_value = mock_file 105 | 106 | # Mock OpenAI client 107 | mock_client = MagicMock() 108 | mock_completions = MagicMock() 109 | mock_client.chat.completions = mock_completions 110 | mock_openai_class.return_value = mock_client 111 | 112 | # Create mock response 113 | mock_response = MagicMock() 114 | mock_response.choices = [ 115 | MagicMock(message=MagicMock(content="Success response")) 116 | ] 117 | 118 | # Mock rate limit twice, then success 119 | mock_completions.create.side_effect = [ 120 | OpenAIError("Rate limit exceeded"), 121 | OpenAIError("Rate limit exceeded"), 122 | mock_response, 123 | ] 124 | 125 | result = gpt4_model.describe_image("test.jpg") 126 | assert result == "Success response" 127 | assert mock_completions.create.call_count == 3 128 | 129 | 130 | def test_gpt4_retry_server_error(gpt4_model, mock_image_data): 131 | """Test retry on OpenAI server error.""" 132 | with ( 133 | patch("builtins.open", create=True) as mock_open, 134 | patch( 135 | "pyvisionai.describers.openai.OpenAI" 136 | ) as mock_openai_class, 137 | ): 138 | # Mock file reading 139 | mock_file = MagicMock() 140 | mock_file.read.return_value = mock_image_data 141 | mock_open.return_value.__enter__.return_value = mock_file 142 | 143 | # Mock OpenAI client 144 | mock_client = MagicMock() 145 | mock_completions = MagicMock() 146 | mock_client.chat.completions = mock_completions 147 | mock_openai_class.return_value = mock_client 148 | 149 | # Create mock response 150 | mock_response = MagicMock() 151 | mock_response.choices = [ 152 | MagicMock(message=MagicMock(content="Success response")) 153 | ] 154 | 155 | # Mock server error twice, then success 156 | mock_completions.create.side_effect = [ 157 | OpenAIError("Internal server error"), 158 | OpenAIError("Internal server error"), 159 | mock_response, 160 | ] 161 | 162 | result = gpt4_model.describe_image("test.jpg") 163 | assert result == "Success response" 164 | assert mock_completions.create.call_count == 3 165 | 166 | 167 | def test_max_retries_exceeded(llama_model, mock_image_data): 168 | """Test failure after max retries.""" 169 | with ( 170 | patch("requests.post") as mock_post, 171 | patch("builtins.open", create=True) as mock_open, 172 | ): 173 | # Mock file reading 174 | mock_file = MagicMock() 175 | mock_file.read.return_value = mock_image_data 176 | mock_open.return_value.__enter__.return_value = mock_file 177 | 178 | # Mock connection error consistently 179 | mock_post.side_effect = requests.exceptions.ConnectionError( 180 | "Connection refused" 181 | ) 182 | 183 | with pytest.raises(ConnectionError, match="Connection refused"): 184 | llama_model.describe_image("test.jpg") 185 | 186 | assert mock_post.call_count == 3 # Initial attempt + 2 retries 187 | -------------------------------------------------------------------------------- /tests/describers/test_claude.py: -------------------------------------------------------------------------------- 1 | """Unit tests for Claude Vision model.""" 2 | 3 | import logging 4 | import os 5 | from unittest.mock import MagicMock, patch 6 | 7 | import pytest 8 | from anthropic import APIError, AuthenticationError 9 | 10 | from pyvisionai.describers.claude import ClaudeVisionModel 11 | from 
pyvisionai.utils.retry import ConnectionError 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | @pytest.fixture 17 | def claude_model(): 18 | """Create a ClaudeVisionModel instance.""" 19 | api_key = os.getenv("ANTHROPIC_API_KEY", "test_key") 20 | return ClaudeVisionModel(api_key=api_key) 21 | 22 | 23 | @pytest.fixture 24 | def mock_anthropic_setup(): 25 | """Set up common Anthropic mocking.""" 26 | with ( 27 | patch("builtins.open", create=True) as mock_open, 28 | patch( 29 | "pyvisionai.describers.claude.Anthropic" 30 | ) as mock_anthropic_class, 31 | ): 32 | # Mock file reading 33 | mock_file = MagicMock() 34 | mock_file.read.return_value = b"mock_image_bytes" 35 | mock_open.return_value.__enter__.return_value = mock_file 36 | 37 | # Mock Anthropic client 38 | mock_client = MagicMock() 39 | mock_messages = MagicMock() 40 | mock_client.messages = mock_messages 41 | mock_anthropic_class.return_value = mock_client 42 | 43 | yield { 44 | "mock_open": mock_open, 45 | "mock_client": mock_client, 46 | "mock_messages": mock_messages, 47 | } 48 | 49 | 50 | def create_api_error(message: str) -> APIError: 51 | """Create a mock Anthropic APIError.""" 52 | mock_response = MagicMock() 53 | mock_response.status_code = ( 54 | 429 if "rate limit" in message.lower() else 500 55 | ) 56 | mock_response.text = message 57 | return APIError( 58 | message=message, 59 | request=MagicMock(), 60 | body={"error": {"message": message}}, 61 | ) 62 | 63 | 64 | class TestClaudeVisionModel: 65 | """Test suite for Claude Vision model.""" 66 | 67 | def test_init(self, claude_model): 68 | """Test model initialization.""" 69 | assert claude_model.api_key == os.getenv( 70 | "ANTHROPIC_API_KEY", "test_key" 71 | ) 72 | assert claude_model.prompt is None 73 | 74 | def test_validate_config_with_key(self, claude_model): 75 | """Test configuration validation with API key.""" 76 | claude_model.validate_config() # Should not raise 77 | 78 | def test_validate_config_without_key(self): 79 | """Test configuration validation without API key.""" 80 | model = ClaudeVisionModel(api_key=None) 81 | with pytest.raises( 82 | ValueError, match="Anthropic API key is required" 83 | ): 84 | model.validate_config() 85 | 86 | def test_retry_rate_limit( 87 | self, claude_model, mock_anthropic_setup, test_image_path 88 | ): 89 | """Test retry on rate limit errors.""" 90 | mock_messages = mock_anthropic_setup["mock_messages"] 91 | 92 | # Create mock response 93 | mock_response = MagicMock() 94 | mock_response.content = [MagicMock(text="Success response")] 95 | 96 | # Mock rate limit twice, then success 97 | mock_messages.create.side_effect = [ 98 | create_api_error("Rate limit exceeded"), 99 | create_api_error("Rate limit exceeded"), 100 | mock_response, 101 | ] 102 | 103 | result = claude_model.describe_image(test_image_path) 104 | assert result == "Success response" 105 | assert mock_messages.create.call_count == 3 106 | 107 | def test_retry_server_error( 108 | self, claude_model, mock_anthropic_setup, test_image_path 109 | ): 110 | """Test retry on server errors.""" 111 | mock_messages = mock_anthropic_setup["mock_messages"] 112 | 113 | # Create mock response 114 | mock_response = MagicMock() 115 | mock_response.content = [MagicMock(text="Success response")] 116 | 117 | # Mock server error twice, then success 118 | mock_messages.create.side_effect = [ 119 | create_api_error("Internal server error"), 120 | create_api_error("Internal server error"), 121 | mock_response, 122 | ] 123 | 124 | result = 
claude_model.describe_image(test_image_path) 125 | assert result == "Success response" 126 | assert mock_messages.create.call_count == 3 127 | 128 | def test_retry_overloaded( 129 | self, claude_model, mock_anthropic_setup, test_image_path 130 | ): 131 | """Test retry on overloaded errors.""" 132 | mock_messages = mock_anthropic_setup["mock_messages"] 133 | 134 | # Create mock response 135 | mock_response = MagicMock() 136 | mock_response.content = [MagicMock(text="Success response")] 137 | 138 | # Mock overloaded error twice, then success 139 | mock_messages.create.side_effect = [ 140 | create_api_error("Error code: 529 - Overloaded"), 141 | create_api_error("Error code: 529 - Overloaded"), 142 | mock_response, 143 | ] 144 | 145 | result = claude_model.describe_image(test_image_path) 146 | assert result == "Success response" 147 | assert mock_messages.create.call_count == 3 148 | 149 | def test_max_retries_exceeded( 150 | self, claude_model, mock_anthropic_setup, test_image_path 151 | ): 152 | """Test failure after max retries.""" 153 | mock_messages = mock_anthropic_setup["mock_messages"] 154 | 155 | # Mock consistent failure 156 | mock_messages.create.side_effect = create_api_error( 157 | "Rate limit exceeded" 158 | ) 159 | 160 | with pytest.raises( 161 | ConnectionError, match="Rate limit exceeded" 162 | ): 163 | claude_model.describe_image(test_image_path) 164 | assert ( 165 | mock_messages.create.call_count == 3 166 | ) # Initial attempt + 2 retries 167 | 168 | def test_empty_response( 169 | self, claude_model, mock_anthropic_setup, test_image_path 170 | ): 171 | """Test handling of empty response.""" 172 | mock_messages = mock_anthropic_setup["mock_messages"] 173 | 174 | # Create mock response with empty content 175 | mock_response = MagicMock() 176 | mock_response.content = [MagicMock(text="")] 177 | mock_messages.create.return_value = mock_response 178 | 179 | with pytest.raises( 180 | ValueError, match="No description generated" 181 | ): 182 | claude_model.describe_image(test_image_path) 183 | 184 | @pytest.mark.integration 185 | def test_real_api_call(self, test_image_path): 186 | """Test actual API integration.""" 187 | api_key = os.getenv("ANTHROPIC_API_KEY") 188 | if not api_key: 189 | pytest.skip("Skipping test - No Anthropic API key provided") 190 | 191 | model = ClaudeVisionModel(api_key=api_key) 192 | description = model.describe_image(test_image_path) 193 | assert len(description) > 100, "Description seems too short" 194 | assert any( 195 | term in description.lower() for term in ["forest", "tree"] 196 | ), "Expected forest scene description not found" 197 | -------------------------------------------------------------------------------- /tests/describers/test_ollama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MDGrey33/pyvisionai/da23f86e12e6576f5f65bf39f32f424896d853b3/tests/describers/test_ollama.py -------------------------------------------------------------------------------- /tests/describers/test_openai.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MDGrey33/pyvisionai/da23f86e12e6576f5f65bf39f32f424896d853b3/tests/describers/test_openai.py -------------------------------------------------------------------------------- /tests/file_extraction/test_extraction_cli.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/MDGrey33/pyvisionai/da23f86e12e6576f5f65bf39f32f424896d853b3/tests/file_extraction/test_extraction_cli.py -------------------------------------------------------------------------------- /tests/file_extraction/test_extraction_lib.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MDGrey33/pyvisionai/da23f86e12e6576f5f65bf39f32f424896d853b3/tests/file_extraction/test_extraction_lib.py -------------------------------------------------------------------------------- /tests/test_batch_processing.py: -------------------------------------------------------------------------------- 1 | """Tests for batch processing example.""" 2 | 3 | import os 4 | import shutil 5 | import tempfile 6 | from unittest.mock import Mock, patch 7 | 8 | import pytest 9 | 10 | from examples.batch_processing import BatchProcessor 11 | 12 | # Test data paths 13 | TEST_DATA_DIR = os.path.join("tests", "data", "batch") 14 | 15 | 16 | @pytest.fixture 17 | def temp_output_dir(): 18 | """Create a temporary directory for test outputs.""" 19 | temp_dir = tempfile.mkdtemp() 20 | yield temp_dir 21 | shutil.rmtree(temp_dir) 22 | 23 | 24 | @pytest.fixture 25 | def setup_test_files(temp_output_dir): 26 | """Set up test files for batch processing.""" 27 | os.makedirs(TEST_DATA_DIR, exist_ok=True) 28 | 29 | # Create sample files of different types 30 | files = { 31 | "doc1.pdf": "PDF content", 32 | "doc2.docx": "DOCX content", 33 | "doc3.pptx": "PPTX content", 34 | "page.html": "HTML content", 35 | "ignored.txt": "Text content", 36 | } 37 | 38 | for filename, content in files.items(): 39 | filepath = os.path.join(TEST_DATA_DIR, filename) 40 | with open(filepath, "w") as f: 41 | f.write(content) 42 | 43 | yield 44 | 45 | # Cleanup 46 | shutil.rmtree(TEST_DATA_DIR) 47 | 48 | 49 | def test_batch_processor_init(): 50 | """Test BatchProcessor initialization.""" 51 | processor = BatchProcessor(max_workers=2) 52 | assert processor.max_workers == 2 53 | assert len(processor.extractors) == 4 # pdf, docx, pptx, html 54 | assert all( 55 | ext in processor.extractors 56 | for ext in [".pdf", ".docx", ".pptx", ".html"] 57 | ) 58 | 59 | 60 | @patch("examples.batch_processing.create_extractor") 61 | def test_process_file_success( 62 | mock_create_extractor, temp_output_dir, setup_test_files 63 | ): 64 | """Test successful file processing.""" 65 | # Setup mock 66 | mock_extractor = Mock() 67 | mock_extractor.extract.return_value = "output.md" 68 | mock_create_extractor.return_value = mock_extractor 69 | 70 | # Create processor 71 | processor = BatchProcessor() 72 | 73 | # Process PDF file 74 | input_file = os.path.join(TEST_DATA_DIR, "doc1.pdf") 75 | filename, success, message = processor.process_file( 76 | input_file, temp_output_dir 77 | ) 78 | 79 | # Verify 80 | assert filename == "doc1.pdf" 81 | assert success is True 82 | assert "Processed successfully" in message 83 | mock_extractor.extract.assert_called_once() 84 | 85 | 86 | def test_process_file_unsupported(temp_output_dir, setup_test_files): 87 | """Test processing unsupported file type.""" 88 | processor = BatchProcessor() 89 | 90 | # Try to process text file 91 | input_file = os.path.join(TEST_DATA_DIR, "ignored.txt") 92 | filename, success, message = processor.process_file( 93 | input_file, temp_output_dir 94 | ) 95 | 96 | # Verify 97 | assert filename == "ignored.txt" 98 | assert success is False 99 | assert message == "Unsupported file type" 100 | 101 | 102 | 
@patch("examples.batch_processing.create_extractor") 103 | def test_process_file_error( 104 | mock_create_extractor, temp_output_dir, setup_test_files 105 | ): 106 | """Test handling of processing errors.""" 107 | # Setup mock to raise exception 108 | mock_extractor = Mock() 109 | mock_extractor.extract.side_effect = Exception("Processing failed") 110 | mock_create_extractor.return_value = mock_extractor 111 | 112 | # Create processor and process file 113 | processor = BatchProcessor() 114 | input_file = os.path.join(TEST_DATA_DIR, "doc1.pdf") 115 | filename, success, message = processor.process_file( 116 | input_file, temp_output_dir 117 | ) 118 | 119 | # Verify 120 | assert filename == "doc1.pdf" 121 | assert success is False 122 | assert "Error: Processing failed" in message 123 | 124 | 125 | @patch("examples.batch_processing.create_extractor") 126 | def test_process_directory( 127 | mock_create_extractor, temp_output_dir, setup_test_files 128 | ): 129 | """Test processing entire directory.""" 130 | # Setup mock 131 | mock_extractor = Mock() 132 | mock_extractor.extract.return_value = "output.md" 133 | mock_create_extractor.return_value = mock_extractor 134 | 135 | # Create processor and process directory 136 | processor = BatchProcessor(max_workers=2) 137 | successful, failed, errors = processor.process_directory( 138 | TEST_DATA_DIR, temp_output_dir 139 | ) 140 | 141 | # Verify 142 | assert successful == 4 # pdf, docx, pptx, html 143 | assert failed == 0 144 | assert len(errors) == 0 145 | assert mock_extractor.extract.call_count == 4 146 | 147 | 148 | def test_process_directory_empty(temp_output_dir): 149 | """Test processing empty directory.""" 150 | # Create empty directory 151 | empty_dir = os.path.join(temp_output_dir, "empty") 152 | os.makedirs(empty_dir) 153 | 154 | # Process empty directory 155 | processor = BatchProcessor() 156 | successful, failed, errors = processor.process_directory( 157 | empty_dir, temp_output_dir 158 | ) 159 | 160 | # Verify 161 | assert successful == 0 162 | assert failed == 0 163 | assert len(errors) == 1 164 | assert errors[0] == "No files found to process" 165 | 166 | 167 | @patch("examples.batch_processing.create_extractor") 168 | def test_process_directory_filtered( 169 | mock_create_extractor, temp_output_dir, setup_test_files 170 | ): 171 | """Test processing directory with file type filter.""" 172 | # Setup mock 173 | mock_extractor = Mock() 174 | mock_extractor.extract.return_value = "output.md" 175 | mock_create_extractor.return_value = mock_extractor 176 | 177 | processor = BatchProcessor() 178 | 179 | # Process only PDF files 180 | successful, failed, errors = processor.process_directory( 181 | TEST_DATA_DIR, temp_output_dir, file_types=[".pdf"] 182 | ) 183 | 184 | # Verify 185 | assert successful == 1 # Only PDF 186 | assert failed == 0 187 | assert len(errors) == 0 188 | # Verify that only PDF file was processed 189 | mock_extractor.extract.assert_called_once() 190 | args = mock_extractor.extract.call_args[0] 191 | assert args[0].endswith(".pdf") 192 | 193 | 194 | @patch("examples.batch_processing.create_extractor") 195 | def test_parallel_processing( 196 | mock_create_extractor, temp_output_dir, setup_test_files 197 | ): 198 | """Test parallel processing of files.""" 199 | # Setup mock 200 | mock_extractor = Mock() 201 | mock_extractor.extract.return_value = "output.md" 202 | mock_create_extractor.return_value = mock_extractor 203 | 204 | # Process with multiple workers 205 | processor = BatchProcessor(max_workers=4) 206 | successful, 
failed, errors = processor.process_directory( 207 | TEST_DATA_DIR, temp_output_dir 208 | ) 209 | 210 | # Verify 211 | assert successful == 4 212 | assert failed == 0 213 | assert len(errors) == 0 214 | assert mock_extractor.extract.call_count == 4 215 | -------------------------------------------------------------------------------- /tests/test_benchmarks.py: -------------------------------------------------------------------------------- 1 | """Tests for benchmark logging functionality.""" 2 | 3 | import json 4 | from datetime import datetime 5 | from pathlib import Path 6 | 7 | import pytest 8 | 9 | from pyvisionai.utils.benchmark import ( 10 | BenchmarkEntry, 11 | BenchmarkLogger, 12 | BenchmarkMetrics, 13 | ) 14 | from tests.conftest import log_benchmark 15 | 16 | 17 | @pytest.fixture 18 | def sample_benchmark_data(benchmark_log_file): 19 | """Generate sample benchmark data for testing.""" 20 | log_dir = benchmark_log_file.parent 21 | 22 | # Generate CLI benchmark 23 | log_benchmark( 24 | "pdf", 25 | "page_as_image", 26 | { 27 | "interface": "cli", 28 | "cli_time": 13.5, 29 | "output_size": 2500, 30 | }, 31 | log_dir=log_dir, 32 | ) 33 | 34 | # Generate API benchmark 35 | log_benchmark( 36 | "docx", 37 | "text_and_images", 38 | { 39 | "interface": "api", 40 | "setup_time": 0.1, 41 | "extraction_time": 5.2, 42 | "output_size": 1800, 43 | }, 44 | log_dir=log_dir, 45 | ) 46 | 47 | 48 | def test_benchmark_log_structure( 49 | sample_benchmark_data, benchmark_log_file 50 | ): 51 | """Verify that benchmark logs are being created with correct structure.""" 52 | assert ( 53 | benchmark_log_file.exists() 54 | ), "Benchmark log file should be created" 55 | 56 | with open(benchmark_log_file, "r") as f: 57 | lines = f.readlines() 58 | assert len(lines) >= 2, "Expected at least 2 benchmark entries" 59 | 60 | # Parse entries 61 | entries = [ 62 | BenchmarkEntry.from_dict(json.loads(line)) for line in lines 63 | ] 64 | cli_entries = [ 65 | e for e in entries if e.metrics.interface == "cli" 66 | ] 67 | api_entries = [ 68 | e for e in entries if e.metrics.interface == "api" 69 | ] 70 | 71 | assert cli_entries, "Expected at least one CLI benchmark entry" 72 | assert api_entries, "Expected at least one API benchmark entry" 73 | 74 | # Test CLI benchmark entry 75 | cli_entry = cli_entries[-1] 76 | assert cli_entry.test["file_type"] == "pdf" 77 | assert cli_entry.test["method"] == "page_as_image" 78 | assert cli_entry.metrics.cli_time is not None 79 | assert cli_entry.metrics.output_size > 0 80 | 81 | # Test API benchmark entry 82 | api_entry = api_entries[-1] 83 | assert api_entry.test["file_type"] == "docx" 84 | assert api_entry.test["method"] == "text_and_images" 85 | assert api_entry.metrics.setup_time is not None 86 | assert api_entry.metrics.extraction_time is not None 87 | assert api_entry.metrics.output_size > 0 88 | 89 | 90 | def test_benchmark_metrics_validation(): 91 | """Test validation of benchmark metrics.""" 92 | # Test invalid output size 93 | with pytest.raises( 94 | ValueError, match="output_size must be a non-negative integer" 95 | ): 96 | BenchmarkMetrics( 97 | interface="cli", output_size=-1, cli_time=1.0 98 | ).validate() 99 | 100 | # Test invalid interface 101 | with pytest.raises( 102 | ValueError, match="interface must be either 'cli' or 'api'" 103 | ): 104 | BenchmarkMetrics( 105 | interface="invalid", output_size=100, cli_time=1.0 106 | ).validate() 107 | 108 | # Test missing cli_time for CLI interface 109 | with pytest.raises( 110 | ValueError, match="cli_time is required for CLI 
interface" 111 | ): 112 | BenchmarkMetrics(interface="cli", output_size=100).validate() 113 | 114 | # Test missing extraction_time for API interface 115 | with pytest.raises( 116 | ValueError, 117 | match="extraction_time is required for API interface", 118 | ): 119 | BenchmarkMetrics(interface="api", output_size=100).validate() 120 | 121 | 122 | def test_benchmark_normalization(benchmark_log_file): 123 | """Test normalization of benchmark metrics.""" 124 | log_dir = benchmark_log_file.parent 125 | 126 | # Test CLI metrics with total_time 127 | log_benchmark( 128 | "pdf", 129 | "page_as_image", 130 | { 131 | "interface": "cli", 132 | "total_time": 10.0, 133 | "output_size": 1000, 134 | }, 135 | log_dir=log_dir, 136 | ) 137 | 138 | # Test API metrics without setup_time 139 | log_benchmark( 140 | "docx", 141 | "text_and_images", 142 | { 143 | "interface": "api", 144 | "extraction_time": 5.0, 145 | "output_size": 1000, 146 | }, 147 | log_dir=log_dir, 148 | ) 149 | 150 | with open(benchmark_log_file, "r") as f: 151 | lines = f.readlines() 152 | entries = [ 153 | BenchmarkEntry.from_dict(json.loads(line)) for line in lines 154 | ] 155 | 156 | # Check CLI metrics normalization 157 | cli_entry = entries[0] 158 | assert cli_entry.metrics.cli_time == 10.0 159 | 160 | # Check API metrics normalization 161 | api_entry = entries[1] 162 | assert api_entry.metrics.setup_time == 0 163 | assert api_entry.metrics.cli_time == 5.0 164 | 165 | 166 | def test_concurrent_logging(benchmark_log_file): 167 | """Test concurrent logging with file locking.""" 168 | from concurrent.futures import ThreadPoolExecutor 169 | from threading import Event 170 | 171 | log_dir = benchmark_log_file.parent 172 | start_event = Event() 173 | logs_written = [] 174 | 175 | def log_concurrently(): 176 | start_event.wait() 177 | try: 178 | log_benchmark( 179 | "test", 180 | "concurrent", 181 | { 182 | "interface": "cli", 183 | "cli_time": 1.0, 184 | "output_size": 100, 185 | }, 186 | log_dir=log_dir, 187 | ) 188 | logs_written.append(True) 189 | except Exception: 190 | logs_written.append(False) 191 | 192 | # Start multiple threads 193 | with ThreadPoolExecutor(max_workers=5) as executor: 194 | futures = [executor.submit(log_concurrently) for _ in range(5)] 195 | start_event.set() # Start all threads simultaneously 196 | 197 | # Verify all logs were written successfully 198 | assert all(logs_written) 199 | 200 | # Verify log file integrity 201 | with open(benchmark_log_file, "r") as f: 202 | lines = f.readlines() 203 | assert len(lines) == 5 204 | # Verify each line is valid JSON 205 | for line in lines: 206 | entry = BenchmarkEntry.from_dict(json.loads(line)) 207 | assert entry.metrics.cli_time == 1.0 208 | assert entry.metrics.output_size == 100 209 | -------------------------------------------------------------------------------- /tests/test_extraction_cli.py: -------------------------------------------------------------------------------- 1 | """CLI tests for file extraction functionality.""" 2 | 3 | import logging 4 | import os 5 | import subprocess 6 | import time 7 | 8 | import pytest 9 | 10 | from tests.conftest import ( 11 | ids_file_extraction, 12 | log_benchmark, 13 | testdata_file_extraction, 14 | ) 15 | from tests.utils.metrics import print_performance_metrics 16 | from tests.utils.verifiers import ( 17 | content_verifiers, 18 | verify_basic_content, 19 | ) 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | @pytest.fixture(autouse=True) 25 | def setup_test_logging(): 26 | """Configure test-specific logging.""" 
27 | logger.setLevel(logging.ERROR) 28 | return logger 29 | 30 | 31 | @pytest.mark.parametrize( 32 | "file_type,method", 33 | testdata_file_extraction, 34 | ids=ids_file_extraction, 35 | ) 36 | def test_file_extraction_cli(file_type, method, setup_test_env): 37 | """Test file extraction using CLI.""" 38 | logger.info( 39 | f"Starting CLI test for {file_type} using {method} method" 40 | ) 41 | 42 | # Skip if no API key is provided for text_and_images method 43 | if method == "text_and_images": 44 | api_key = os.getenv("OPENAI_API_KEY") 45 | if not api_key: 46 | logger.info("Skipping GPT-4 test - No API key provided") 47 | pytest.skip("Skipping GPT-4 test - No API key provided") 48 | 49 | # Setup 50 | filename = "test" 51 | source_file = os.path.join( 52 | setup_test_env["source_dir"], f"{filename}.{file_type}" 53 | ) 54 | # Create unique output directory for this test 55 | test_output_dir = os.path.join( 56 | setup_test_env["extracted_dir"], f"{file_type}_{method}" 57 | ) 58 | os.makedirs(test_output_dir, exist_ok=True) 59 | 60 | logger.debug(f"Source file: {source_file}") 61 | logger.debug(f"Output directory: {test_output_dir}") 62 | 63 | # Test CLI performance 64 | start_time = time.time() 65 | cmd = [ 66 | "file-extract", 67 | "--type", 68 | file_type, 69 | "--source", 70 | source_file, 71 | "--output", 72 | test_output_dir, 73 | "--extractor", 74 | method, 75 | ] 76 | 77 | # Add API key for text_and_images method 78 | if method == "text_and_images": 79 | cmd.extend(["--api-key", api_key]) 80 | logger.debug("Added API key to command") 81 | 82 | logger.debug(f"Running command: {' '.join(cmd)}") 83 | result = subprocess.run(cmd, capture_output=True, text=True) 84 | cli_time = time.time() - start_time 85 | 86 | # Log any errors 87 | if result.stderr: 88 | logger.error(f"CLI error output: {result.stderr}") 89 | if result.stdout: 90 | logger.debug(f"CLI output: {result.stdout}") 91 | 92 | # Get output path 93 | base_name = os.path.splitext(os.path.basename(source_file))[0] 94 | output_path = os.path.join( 95 | test_output_dir, f"{base_name}_{file_type}.md" 96 | ) 97 | output_size = ( 98 | os.path.getsize(output_path) 99 | if os.path.exists(output_path) 100 | else 0 101 | ) 102 | 103 | logger.debug(f"Output file size: {output_size} bytes") 104 | logger.debug(f"CLI execution time: {cli_time:.2f} seconds") 105 | 106 | # Log CLI benchmark results 107 | log_benchmark( 108 | file_type, 109 | method, 110 | { 111 | "cli_time": cli_time, 112 | "output_size": output_size, 113 | }, 114 | ) 115 | 116 | # Print performance metrics 117 | print_performance_metrics( 118 | file_type=file_type, 119 | method=method, 120 | setup_time=0, 121 | extraction_time=cli_time, 122 | output_size=output_size, 123 | interface="CLI", 124 | ) 125 | 126 | # Verify output 127 | output_path = os.path.join(test_output_dir, f"test_{file_type}.md") 128 | logger.debug(f"Verifying output file: {output_path}") 129 | 130 | with open(output_path, "r") as f: 131 | content = f.read() 132 | verify_basic_content(content) 133 | if file_type in content_verifiers: 134 | content_verifiers[file_type](content) 135 | 136 | logger.info( 137 | f"CLI test for {file_type} using {method} method completed successfully" 138 | ) 139 | -------------------------------------------------------------------------------- /tests/test_extraction_lib.py: -------------------------------------------------------------------------------- 1 | """Library API tests for file extraction functionality.""" 2 | 3 | import os 4 | import time 5 | 6 | import pytest 7 | 8 | from 
pyvisionai import create_extractor 9 | from tests.conftest import ( 10 | ids_file_extraction, 11 | log_benchmark, 12 | testdata_file_extraction, 13 | ) 14 | from tests.utils.metrics import print_performance_metrics 15 | from tests.utils.verifiers import ( 16 | content_verifiers, 17 | verify_basic_content, 18 | ) 19 | 20 | 21 | @pytest.mark.parametrize( 22 | "file_type,method", 23 | testdata_file_extraction, 24 | ids=ids_file_extraction, 25 | ) 26 | def test_file_extraction_lib(file_type, method, setup_test_env): 27 | """Test file extraction using library API.""" 28 | # Setup 29 | filename = "test" 30 | source_file = os.path.join( 31 | setup_test_env["source_dir"], f"{filename}.{file_type}" 32 | ) 33 | # Create unique output directory for this test 34 | test_output_dir = os.path.join( 35 | setup_test_env["extracted_dir"], f"{file_type}_{method}" 36 | ) 37 | os.makedirs(test_output_dir, exist_ok=True) 38 | 39 | # Test API performance and functionality 40 | start_time = time.time() 41 | extractor = create_extractor(file_type, method) 42 | setup_time = time.time() - start_time 43 | 44 | start_time = time.time() 45 | output_path = extractor.extract(source_file, test_output_dir) 46 | extraction_time = time.time() - start_time 47 | 48 | # Measure output size 49 | output_size = ( 50 | os.path.getsize(output_path) 51 | if os.path.exists(output_path) 52 | else 0 53 | ) 54 | 55 | # Log API benchmark results 56 | log_benchmark( 57 | file_type, 58 | method, 59 | { 60 | "setup_time": setup_time, 61 | "extraction_time": extraction_time, 62 | "output_size": output_size, 63 | }, 64 | ) 65 | 66 | # Print performance metrics 67 | print_performance_metrics( 68 | file_type=file_type, 69 | method=method, 70 | setup_time=setup_time, 71 | extraction_time=extraction_time, 72 | output_size=output_size, 73 | interface="API", 74 | ) 75 | 76 | # Verify output 77 | with open(output_path, "r") as f: 78 | content = f.read() 79 | verify_basic_content(content) 80 | if file_type in content_verifiers: 81 | content_verifiers[file_type](content) 82 | -------------------------------------------------------------------------------- /tests/test_integration.py: -------------------------------------------------------------------------------- 1 | """Integration tests for image description functionality.""" 2 | 3 | import logging 4 | 5 | import pytest 6 | 7 | from pyvisionai import describe_image_ollama, describe_image_openai 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | @pytest.mark.integration 13 | class TestImageDescription: 14 | """Integration tests for image description.""" 15 | 16 | @pytest.mark.openai 17 | def test_openai_description(self, test_image_path): 18 | """Test OpenAI image description.""" 19 | description = describe_image_openai(test_image_path) 20 | assert len(description) > 100, "Description seems too short" 21 | assert any( 22 | term in description.lower() for term in ["forest", "tree"] 23 | ), "Expected forest scene description not found" 24 | 25 | @pytest.mark.ollama 26 | def test_ollama_description(self, test_image_path): 27 | """Test Ollama image description.""" 28 | description = describe_image_ollama(test_image_path) 29 | assert len(description) > 100, "Description seems too short" 30 | assert any( 31 | term in description.lower() for term in ["forest", "tree"] 32 | ), "Expected forest scene description not found" 33 | 34 | @pytest.mark.openai 35 | def test_openai_custom_prompt(self, test_image_path): 36 | """Test OpenAI with custom prompt.""" 37 | custom_prompt = "List the main colors present 
in this image" 38 | description = describe_image_openai( 39 | test_image_path, 40 | prompt=custom_prompt, 41 | ) 42 | assert any( 43 | term in description.lower() 44 | for term in ["color", "green", "brown"] 45 | ), "Custom prompt was not reflected in output" 46 | 47 | @pytest.mark.ollama 48 | def test_ollama_custom_prompt(self, test_image_path): 49 | """Test Ollama with custom prompt.""" 50 | custom_prompt = "List the main colors present in this image" 51 | description = describe_image_ollama( 52 | test_image_path, 53 | prompt=custom_prompt, 54 | ) 55 | assert any( 56 | term in description.lower() 57 | for term in ["color", "green", "brown"] 58 | ), "Custom prompt was not reflected in output" 59 | -------------------------------------------------------------------------------- /tests/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MDGrey33/pyvisionai/da23f86e12e6576f5f65bf39f32f424896d853b3/tests/utils/__init__.py -------------------------------------------------------------------------------- /tests/utils/conftest.py: -------------------------------------------------------------------------------- 1 | """Test fixtures and mock classes for utils tests.""" 2 | 3 | import logging 4 | from unittest.mock import MagicMock 5 | 6 | import pytest 7 | 8 | from pyvisionai.utils.retry import ( 9 | RetryableError, 10 | RetryManager, 11 | RetryStrategy, 12 | ) 13 | 14 | 15 | class MockRetryableError(RetryableError): 16 | """Mock error that should trigger retry.""" 17 | 18 | pass 19 | 20 | 21 | @pytest.fixture 22 | def mock_logger(): 23 | """Create a mock logger for testing.""" 24 | return MagicMock(spec=logging.Logger) 25 | 26 | 27 | @pytest.fixture 28 | def retry_manager(mock_logger): 29 | """Create a RetryManager instance for testing.""" 30 | return RetryManager( 31 | max_attempts=3, 32 | strategy=RetryStrategy.EXPONENTIAL, 33 | base_delay=0.1, # Small delay for faster tests 34 | max_delay=1.0, 35 | logger=mock_logger, 36 | ) 37 | -------------------------------------------------------------------------------- /tests/utils/metrics.py: -------------------------------------------------------------------------------- 1 | """Performance metrics utilities for tests.""" 2 | 3 | 4 | def print_performance_metrics( 5 | file_type: str, 6 | method: str, 7 | setup_time: float, 8 | extraction_time: float, 9 | output_size: int, 10 | interface: str = "API", 11 | ): 12 | """Print performance metrics in a consistent format.""" 13 | total_time = setup_time + extraction_time 14 | print(f"\n{file_type.upper()} ({method}) {interface} Performance:") 15 | if interface == "API": 16 | print(f"Setup time: {setup_time:.2f}s") 17 | print(f"Extraction time: {extraction_time:.2f}s") 18 | print(f"Total time: {total_time:.2f}s") 19 | print(f"Output size: {output_size / 1024:.2f}KB") 20 | -------------------------------------------------------------------------------- /tests/utils/verifiers.py: -------------------------------------------------------------------------------- 1 | """Content verification utilities for tests.""" 2 | 3 | 4 | def verify_basic_content(content: str): 5 | """Verify basic content requirements.""" 6 | # Check that content is not empty 7 | assert len(content) > 0, "Content should not be empty" 8 | 9 | # Check that content has some meaningful text 10 | assert ( 11 | len(content.split()) > 10 12 | ), "Content should have meaningful description" 13 | 14 | # Check for markdown formatting 15 | assert ( 16 | content.startswith("#") 
or "Page" in content 17 | ), "Content should be properly formatted" 18 | 19 | 20 | def verify_pdf_content(content: str): 21 | """Verify PDF-specific content.""" 22 | assert ( 23 | "Page 1" in content 24 | ), "PDF content should include page numbers" 25 | assert ( 26 | "Exploring Nature" in content 27 | ), "Expected document title not found in PDF" 28 | assert ( 29 | "[Image" in content and "forest" in content.lower() 30 | ), "Expected forest description not found in PDF" 31 | 32 | 33 | def verify_pptx_content(content: str): 34 | """Verify PPTX-specific content.""" 35 | assert ( 36 | "Slide 1" in content 37 | ), "PPTX content should include slide numbers" 38 | if "[Image" in content: 39 | assert any( 40 | term in content.lower() for term in ["tablet", "person"] 41 | ), "Expected image content not found in PPTX" 42 | 43 | 44 | def verify_docx_content(content: str): 45 | """Verify DOCX-specific content.""" 46 | assert ( 47 | "Exploring Nature" in content 48 | ), "Expected document title not found in DOCX" 49 | if "[Image" in content: 50 | assert ( 51 | "forest" in content.lower() 52 | ), "Expected forest description not found in DOCX" 53 | 54 | 55 | def verify_html_content(content: str): 56 | """Verify HTML-specific content.""" 57 | assert ( 58 | "Page 1" in content 59 | ), "HTML content should include page numbers" 60 | assert ( 61 | "Exploring Nature" in content 62 | ), "Expected document title not found in HTML" 63 | assert ( 64 | "interactive" in content.lower() 65 | ), "Expected interactive elements description not found in HTML" 66 | assert ( 67 | "biodiversity" in content.lower() 68 | ), "Expected biodiversity section not found in HTML" 69 | assert ( 70 | "[Image" in content and "forest" in content.lower() 71 | ), "Expected forest description not found in HTML" 72 | 73 | 74 | # Content verification mapping 75 | content_verifiers = { 76 | "pdf": verify_pdf_content, 77 | "docx": verify_docx_content, 78 | "pptx": verify_pptx_content, 79 | "html": verify_html_content, 80 | } 81 | --------------------------------------------------------------------------------