├── .env.example
├── .gitignore
├── README.md
├── config
    └── service-account-template.json
├── docs
    ├── advanced_usage.md
    ├── architecture.md
    ├── quickstart.md
    ├── supabase_setup.md
    └── troubleshooting.md
├── examples
    └── langchain_examples.py
├── rag
    ├── __init__.py
    ├── auth.py
    ├── ingestion.py
    ├── query.py
    ├── store.py
    └── utils.py
├── requirements.txt
├── run_examples.sh
├── setup.bat
├── setup.sh
├── setup_vector_store.sql
└── utils
    ├── __init__.py
    └── display_utils.py


/.env.example:
--------------------------------------------------------------------------------
 1 | # OpenAI API (may be required)
 2 | OPENAI_API_KEY=your_openai_api_key_here
 3 | 
 4 | # Google Drive authentication
 5 | GOOGLE_APPLICATION_CREDENTIALS=./config/credentials.json
 6 | 
 7 | # Google OAuth2 Configuration (optional if using service account)
 8 | # GOOGLE_CLIENT_ID=your_client_id_here
 9 | # GOOGLE_CLIENT_SECRET=your_client_secret_here
10 | # GOOGLE_REFRESH_TOKEN=your_refresh_token_here
11 | 
12 | # Google Drive Default Folder
13 | GOOGLE_DRIVE_DEFAULT_FOLDER_ID=your_folder_id_here
14 | 
15 | # Supabase Configuration
16 | SUPABASE_ACCESS_TOKEN=your_supabase_access_token
17 | # SUPABASE_ORG_ID=your_org_id  # Uncomment if you have multiple organizations
18 | SUPABASE_PROJECT_ID=your_project_id
19 | SUPABASE_PROJECT_REF=your_project_ref
20 | SUPABASE_SERVICE_KEY=your_service_key
21 | SUPABASE_URL=your_supabase_url
22 | SUPABASE_POSTGRES_URL=postgres://postgres:your_password@db.your_project_id.supabase.co:5432/postgres
23 | 
24 | # Optional: Additional configuration
25 | # CHROMA_PERSIST_DIRECTORY=custom_chroma_directory 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Python
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | *.so
 6 | .Python
 7 | env/
 8 | build/
 9 | develop-eggs/
10 | dist/
11 | downloads/
12 | eggs/
13 | .eggs/
14 | lib/
15 | lib64/
16 | parts/
17 | sdist/
18 | var/
19 | *.egg-info/
20 | .installed.cfg
21 | *.egg
22 | 
23 | # Virtual Environment
24 | venv/
25 | ENV/
26 | env/
27 | 
28 | # Environment Variables
29 | .env
30 | 
31 | # Service Account Keys
32 | config/service-account.json
33 | 
34 | # IDE files
35 | .idea/
36 | .vscode/
37 | *.swp
38 | *.swo
39 | 
40 | # OS files
41 | .DS_Store
42 | Thumbs.db
43 | 
44 | # Logs
45 | *.log
46 | logs/
47 | 
48 | # Database
49 | *.db
50 | *.sqlite3
51 | 
52 | # Local Supabase data
53 | supabase/ 
54 | 
55 | .specstory/


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Enterprise-Grade RAG with Google Drive & Supabase
  2 | 
  3 | [![You're Doing RAG Wrong](https://gettingautomated.com/wp-content/uploads/2025/04/Youre-Doing-RAG-Wrong.jpg)](https://youtu.be/auxO_NYTexc "Watch the Tutorial")
  4 | 
  5 | A production-ready Retrieval-Augmented Generation (RAG) system featuring Google Drive integration, document versioning, and Supabase vector storage. This enterprise-grade RAG solution provides advanced document processing capabilities at a fraction of the cost of commercial API services.
  6 | 
  7 | ## ⚠️ Important Disclaimer
  8 | 
  9 | **This framework is intended as a demonstration and educational resource.** Retrieval-Augmented Generation (RAG) is not a one-size-fits-all solution, and the implementation needs to be tailored to your specific use case for optimal results.
 10 | 
 11 | To use this framework effectively with your own data:
 12 | - You will need to modify components to match your specific document types and content structure
 13 | - Chunking strategies should be adjusted based on your query patterns and document characteristics
 14 | - Embedding models may need to be changed depending on your domain-specific terminology
 15 | - Performance tuning will be required based on your scale and response time requirements
 16 | 
 17 | While this framework provides a solid foundation, successful RAG implementations require iterative testing and optimization with your actual data and use cases.
 18 | 
 19 | ## 🚀 Why Use This RAG System?
 20 | 
 21 | - **Enterprise-Ready**: Production-quality implementation with robust error handling, versioning, and comprehensive metadata
 22 | - **Cost-Effective**: Build your own RAG system without expensive API or SaaS subscription costs
 23 | - **Full Control**: Customize every aspect of your RAG pipeline to your exact requirements
 24 | - **Complete Privacy**: Your documents never leave your infrastructure - essential for sensitive data
 25 | - **Advanced Features**: Document versioning, image OCR, table extraction, and face detection capabilities
 26 | - **Seamless Integration**: Works natively with Google Drive and Supabase for simple deployment
 27 | 
 28 | ## ⚡ System Workflow
 29 | 
 30 | ```mermaid
 31 | graph TD
 32 |     A[Google Drive Documents] -->|Secure Authentication| B[Document Ingestion]
 33 |     B -->|Content Extraction| C[Document Processing]
 34 |     C -->|Text Chunking| D[Embedding Generation]
 35 |     D -->|Vector Storage| E[Supabase pgVector]
 36 |     E -->|Similarity Search| F[Query Engine]
 37 |     F -->|Relevant Content| G[RAG Application]
 38 |     
 39 |     style A fill:#ff9966,stroke:#333,stroke-width:2px
 40 |     style C fill:#6699ff,stroke:#333,stroke-width:2px
 41 |     style E fill:#66cc99,stroke:#333,stroke-width:2px
 42 |     style G fill:#9966cc,stroke:#333,stroke-width:2px
 43 | ```
 44 | 
 45 | ## Core Features
 46 | 
 47 | - **Google Drive Integration**: Secure document access with service account authentication
 48 | - **Advanced Document Versioning**: 
 49 |   - Track document changes with automatic version history
 50 |   - Skip unchanged documents to reduce processing & embedding costs
 51 |   - Distinguish between content changes vs. metadata-only updates
 52 | - **Rich Metadata Extraction**:
 53 |   - Comprehensive document metadata including access permissions
 54 |   - Direct permissions API with SSL error handling
 55 |   - Hierarchical metadata with role-based access summaries
 56 | - **Advanced Media Processing**:
 57 |   - OCR for text extraction from images
 58 |   - Table extraction from PDFs with structure preservation
 59 |   - Image analysis with face detection and visual content understanding
 60 | - **High-Performance Processing**:
 61 |   - Parallel document processing with built-in timeout protection
 62 |   - Optimized chunking with content-aware boundary detection
 63 |   - Efficient error handling with graceful degradation
 64 | - **Enterprise Vector Storage**:
 65 |   - Supabase pgvector with optimized indexes
 66 |   - Document-level versioning and change tracking
 67 |   - Metadata-rich embeddings for advanced filtering
 68 | 
 69 | ## 📊 Cost Comparison
 70 | 
 71 | Building your own RAG system with this solution can be significantly more cost-effective than using commercial services:
 72 | 
 73 | | Component | Cost Factor | Monthly Est. (10k docs) | Notes |
 74 | |-----------|-------------|-------------------------|-------|
 75 | | **Supabase** | Vector storage | $0-25/month | Free tier available for smaller collections |
 76 | | **OpenAI API** | Embeddings | $0.50-5/month | Based on ~$0.0001/1K tokens |
 77 | | **Google Drive** | Document storage | $0-2/month | Free tier available (15GB) |
 78 | | **Total** | | **$0.50-32/month** | |
 79 | 
 80 | **Compared to Commercial RAG Services:**
 81 | - Enterprise RAG solutions: $500-5,000/month (based on usage)
 82 | - Managed vector DBs: $50-500/month (for similar document volume)
 83 | - Commercial embedding APIs: $20-200/month (for similar document volume)
 84 | 
 85 | ## Architecture
 86 | 
 87 | The system follows a modular design with these key components:
 88 | 
 89 | ```mermaid
 90 | flowchart LR
 91 |     GDrive["Google Drive Documents"] --> Auth["Authentication (auth.py)"]
 92 |     Auth --> Ingestion["Document Ingestion (ingestion.py)"]
 93 |     Ingestion --> Processor["Document Processor (utils.py)"]
 94 |     Processor --> VectorStore["Supabase pgVector (store.py)"]
 95 |     VectorStore --> Query["Query Engine (query.py)"]
 96 |     Query --> Results["Search Results"]
 97 |     
 98 |     classDef core fill:#0d5794,stroke:#333,stroke-width:1px;
 99 |     classDef data fill:#b3a41d,stroke:#333,stroke-width:1px;
100 |     classDef process fill:#a934e3,stroke:#333,stroke-width:1px;
101 |     
102 |     class GDrive,Results data;
103 |     class Auth,Ingestion,Processor,VectorStore,Query core;
104 | ```
105 | 
106 | See the full [Architecture Documentation](docs/architecture.md) for more details on the system design.
107 | 
108 | ## Requirements
109 | 
110 | - Python 3.9 or higher
111 | - Supabase account (for vector storage)
112 | - OpenAI API key
113 | - Google Drive access (service account recommended)
114 | 
115 | ## Installation
116 | 
117 | ### Linux/macOS
118 | 
119 | ```bash
120 | # Clone the repository
121 | git clone https://github.com/Getting-Automated/n8n-rag-example.git
122 | cd n8n-rag-example
123 | 
124 | # Run the setup script
125 | chmod +x setup.sh
126 | ./setup.sh
127 | ```
128 | 
129 | ### Windows
130 | 
131 | ```batch
132 | # Clone the repository
133 | git clone https://github.com/Getting-Automated/n8n-rag-example.git
134 | cd n8n-rag-example
135 | 
136 | # Run the setup script
137 | setup.bat
138 | ```
139 | 
140 | ## Quick Start
141 | 
142 | Check out the [Quick Start Guide](docs/quickstart.md) for setup instructions and examples.
143 | 
144 | ## Documentation
145 | 
146 | Full documentation is available in the [docs](docs/) directory:
147 | 
148 | - [Quick Start Guide](docs/quickstart.md)
149 | - [Architecture Overview](docs/architecture.md)
150 | - [Advanced Usage](docs/advanced_usage.md)
151 | - [Supabase Setup Guide](docs/supabase_setup.md)
152 | - [Troubleshooting](docs/troubleshooting.md)
153 | 
154 | ## Examples
155 | 
156 | The repository includes comprehensive examples in the `examples/` directory:
157 | 
158 | ```bash
159 | # Activate your virtual environment
160 | source venv/bin/activate  # Linux/macOS
161 | venv\Scripts\activate     # Windows
162 | 
163 | # Run the examples
164 | python examples/langchain_examples.py
165 | ```
166 | 
167 | The examples demonstrate:
168 | 1. Basic ingestion with standard settings
169 | 2. Advanced ingestion with enhanced features
170 | 3. Specific file processing with precision targeting
171 | 4. Enhanced media processing for images and PDFs
172 | 
173 | ## Project Structure
174 | 
175 | ```
176 | n8n-rag-example/
177 | ├── config/                      # Configuration files
178 | │   └── service-account.json     # Google service account credentials (add your own)
179 | ├── docs/                        # Documentation
180 | │   ├── advanced_usage.md        # Advanced usage guide
181 | │   ├── architecture.md          # Architecture documentation
182 | │   ├── quickstart.md            # Quick start guide
183 | │   ├── supabase_setup.md        # Supabase setup instructions
184 | │   └── troubleshooting.md       # Troubleshooting guide
185 | ├── examples/                    # Example code
186 | │   └── langchain_examples.py    # Comprehensive usage examples
187 | ├── rag/                         # Core RAG module
188 | │   ├── __init__.py              # Package initialization
189 | │   ├── auth.py                  # Authentication utilities
190 | │   ├── ingestion.py             # Document ingestion logic
191 | │   ├── query.py                 # Query processing
192 | │   ├── store.py                 # Vector store management
193 | │   └── utils.py                 # Helper utilities
194 | ├── utils/                       # Utility modules
195 | │   ├── __init__.py              # Package initialization
196 | │   └── display_utils.py         # Display utilities for examples
197 | ├── setup_vector_store.sql       # SQL setup for Supabase
198 | ├── setup.sh                     # Setup script for Linux/macOS
199 | ├── setup.bat                    # Setup script for Windows
200 | ├── requirements.txt             # Python dependencies
201 | └── ARCHITECTURE.md              # High-level architecture overview
202 | ```
203 | 
204 | ## 🤝 Join the Getting Automated Community
205 | 
206 | Want to go deeper with automation and get direct support? Join our exclusive automation community!
207 | 
208 | ### What You Get from the Getting Automated Community:
209 | - **In-depth Automation Workflows**: Learn how to integrate AI into your automation processes
210 | - **Battle-Tested Templates**: Access exclusive, production-ready automation templates
211 | - **Expert Guidance**: Get direct support from automation professionals
212 | - **Early Access to Content**: Be the first to access exclusive content
213 | - **Private Support Channels**: Receive personalized support through direct chat and office hours
214 | - **Community of Serious Builders**: Connect with like-minded professionals
215 | 
216 | The community is capped at 250 members to ensure quality support and interaction.
217 | 
218 | [Join the Getting Automated Community](https://resources.gettingautomated.com/)
219 | 
220 | ## 🔗 Additional Resources
221 | 
222 | - **Website**: [Getting Automated](https://gettingautomated.com)
223 | - **YouTube Channel**: [Getting Automated YouTube](https://www.youtube.com/@hunterasneed)
224 | - **Free Workflow Automation Tools**: [Automation Tools](https://tools.gettingautomated.com)
225 | 
226 | ### Need Personalized Help?
227 | 
228 | If you need this solution built for you or want personalized guidance, you can schedule a consultation:
229 | 
230 | [Schedule a 30-Minute Connect](https://calendly.com/workflowsy/30-minute-connect)
231 | 
232 | ## 📄 License
233 | 
234 | MIT


--------------------------------------------------------------------------------
/config/service-account-template.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "type": "service_account",
 3 |   "project_id": "your-project-id",
 4 |   "private_key_id": "your-private-key-id",
 5 |   "private_key": "-----BEGIN PRIVATE KEY-----\nYOUR_PRIVATE_KEY_CONTENT_HERE\n-----END PRIVATE KEY-----\n",
 6 |   "client_email": "your-service-account@your-project-id.iam.gserviceaccount.com",
 7 |   "client_id": "your-client-id",
 8 |   "auth_uri": "https://accounts.google.com/o/oauth2/auth",
 9 |   "token_uri": "https://oauth2.googleapis.com/token",
10 |   "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
11 |   "client_x509_cert_url": "https://www.googleapis.com/robot/v1/metadata/x509/your-service-account%40your-project-id.iam.gserviceaccount.com",
12 |   "universe_domain": "googleapis.com"
13 | }


--------------------------------------------------------------------------------
/docs/advanced_usage.md:
--------------------------------------------------------------------------------
  1 | # Advanced Usage Guide
  2 | 
  3 | This guide covers advanced patterns and techniques for using the n8n RAG Example to its full potential.
  4 | 
  5 | ## Advanced Ingestion Patterns
  6 | 
  7 | ### Custom Ingestion Parameters
  8 | 
  9 | The `LangChainIngestion` class accepts various parameters to customize the ingestion process:
 10 | 
 11 | ```python
 12 | from rag.auth import GoogleDriveAuth
 13 | from rag.ingestion import LangChainIngestion
 14 | 
 15 | # Authenticate with Google Drive
 16 | auth = GoogleDriveAuth()
 17 | drive = auth.authenticate()
 18 | 
 19 | # Create ingestion with custom parameters
 20 | ingestion = LangChainIngestion(
 21 |     drive_client=drive,
 22 |     chunk_size=500,                 # Size of text chunks
 23 |     chunk_overlap=50,               # Overlap between chunks
 24 |     table_name="custom_collection", # Supabase table name
 25 |     workers=4,                      # Number of parallel workers
 26 |     use_parallel=True,              # Enable parallel processing
 27 |     timeout=120,                    # Timeout for processing each file (seconds)
 28 |     skip_unchanged=True             # Skip unchanged documents
 29 | )
 30 | 
 31 | # Process documents
 32 | docs = ingestion.process_folder("your_folder_id")
 33 | ```
 34 | 
 35 | ### Selective Document Processing
 36 | 
 37 | Process only specific file types or matching certain criteria:
 38 | 
 39 | ```python
 40 | # Process only PDF and Word documents
 41 | docs = ingestion.process_folder(
 42 |     "your_folder_id",
 43 |     mime_types=["application/pdf", "application/vnd.google-apps.document"]
 44 | )
 45 | 
 46 | # Process files with specific names
 47 | docs = ingestion.process_folder(
 48 |     "your_folder_id",
 49 |     query="name contains 'report'"
 50 | )
 51 | 
 52 | # Process only files modified after a certain date
 53 | docs = ingestion.process_folder(
 54 |     "your_folder_id",
 55 |     query="modifiedTime > '2023-10-01T00:00:00'"
 56 | )
 57 | ```
 58 | 
 59 | ### Processing Individual Files
 60 | 
 61 | Process specific files directly:
 62 | 
 63 | ```python
 64 | # Process a specific file by ID
 65 | doc = ingestion.process_file("your_file_id")
 66 | 
 67 | # Process multiple specific files
 68 | docs = ingestion.process_files(["file_id_1", "file_id_2", "file_id_3"])
 69 | ```
 70 | 
 71 | ### Metadata Filtering
 72 | 
 73 | Apply metadata filters during querying:
 74 | 
 75 | ```python
 76 | from rag.query import query_langchain
 77 | 
 78 | # Query documents with metadata filtering
 79 | results = query_langchain(
 80 |     query="renewable energy",
 81 |     table_name="langchain_docs",
 82 |     filter_metadata={
 83 |         "source_type": "pdf", 
 84 |         "department": "research"
 85 |     }
 86 | )
 87 | ```
 88 | 
 89 | ## Advanced Media Processing
 90 | 
 91 | ### OCR and Image Processing
 92 | 
 93 | The system includes advanced image processing for extracting text from images:
 94 | 
 95 | ```python
 96 | from rag.ingestion import LangChainIngestion
 97 | 
 98 | # Create ingestion with enhanced image processing
 99 | ingestion = LangChainIngestion(
100 |     enable_ocr=True,                # Enable OCR for images
101 |     ocr_min_confidence=70,          # Minimum confidence score (0-100)
102 |     preprocess_images=True,         # Apply preprocessing to images
103 |     extract_image_features=True     # Extract visual features
104 | )
105 | 
106 | # Process documents with images
107 | docs = ingestion.process_folder("your_folder_id")
108 | ```
109 | 
110 | ### PDF Table Extraction
111 | 
112 | Extract and preserve table structures from PDFs:
113 | 
114 | ```python
115 | from rag.ingestion import LangChainIngestion
116 | 
117 | # Create ingestion with table extraction enabled
118 | ingestion = LangChainIngestion(
119 |     extract_tables=True,               # Enable table extraction
120 |     table_extraction_engines=["all"],  # Use all available engines
121 |     preserve_table_structure=True      # Keep table structure in output
122 | )
123 | 
124 | # Process PDF documents
125 | docs = ingestion.process_folder("your_folder_id")
126 | ```
127 | 
128 | ## Document Versioning
129 | 
130 | The system tracks document versions to optimize processing and avoid redundant work:
131 | 
132 | ```python
133 | from rag.ingestion import LangChainIngestion
134 | 
135 | # Create ingestion with versioning options
136 | ingestion = LangChainIngestion(
137 |     skip_unchanged=True,              # Skip unchanged documents
138 |     force_refresh=False,              # Don't force refresh if unchanged
139 |     track_content_changes=True        # Track if content actually changed
140 | )
141 | 
142 | # Process documents with versioning
143 | docs = ingestion.process_folder("your_folder_id")
144 | 
145 | # Force refresh of all documents regardless of changes
146 | docs_refreshed = ingestion.process_folder(
147 |     "your_folder_id",
148 |     force_refresh=True
149 | )
150 | ```
151 | 
152 | ## Parallel Processing
153 | 
154 | Speed up ingestion with parallel processing:
155 | 
156 | ```python
157 | from rag.ingestion import LangChainIngestion
158 | 
159 | # Set up parallel processing
160 | ingestion = LangChainIngestion(
161 |     use_parallel=True,        # Enable parallel processing
162 |     workers=8,                # Number of worker processes
163 |     batch_size=10,            # Number of documents per batch
164 |     timeout=180               # Processing timeout per document (seconds)
165 | )
166 | 
167 | # Process a large folder with parallel workers
168 | docs = ingestion.process_folder("your_large_folder_id")
169 | ```
170 | 
171 | ## Advanced Querying
172 | 
173 | ### Similarity Search with Metadata Filtering
174 | 
175 | Combine vector similarity with metadata filtering:
176 | 
177 | ```python
178 | from rag.query import query_langchain
179 | 
180 | # Basic query with metadata filtering
181 | results = query_langchain(
182 |     query="climate change impact",
183 |     table_name="langchain_docs",
184 |     top_k=5,
185 |     filter_metadata={"document_type": "research_paper"}
186 | )
187 | 
188 | # More complex metadata filtering
189 | results = query_langchain(
190 |     query="renewable energy technology",
191 |     table_name="langchain_docs", 
192 |     top_k=10,
193 |     filter_metadata={
194 |         "year": "2023",
195 |         "department": "research",
196 |         "status": "published"
197 |     }
198 | )
199 | ```
200 | 
201 | ### Custom Embedding Options
202 | 
203 | Use different embedding models or parameters:
204 | 
205 | ```python
206 | from rag.query import query_langchain
207 | from langchain_openai import OpenAIEmbeddings
208 | 
209 | # Create custom embeddings
210 | custom_embeddings = OpenAIEmbeddings(
211 |     model="text-embedding-3-large",
212 |     dimensions=1536
213 | )
214 | 
215 | # Query with custom embeddings
216 | results = query_langchain(
217 |     query="future of renewable energy",
218 |     table_name="langchain_docs",
219 |     embedding_function=custom_embeddings
220 | )
221 | ```
222 | 
223 | ## Performance Optimization
224 | 
225 | ### Optimizing Chunk Size
226 | 
227 | Finding the right chunk size is essential for good retrieval performance:
228 | 
229 | ```python
230 | from rag.ingestion import LangChainIngestion
231 | 
232 | # Smaller chunks for precise retrieval (better for Q&A)
233 | ingestion_precise = LangChainIngestion(
234 |     chunk_size=300,
235 |     chunk_overlap=30
236 | )
237 | 
238 | # Larger chunks for more context (better for summarization)
239 | ingestion_context = LangChainIngestion(
240 |     chunk_size=1000,
241 |     chunk_overlap=100
242 | )
243 | ```
244 | 
245 | ### Batch Processing for Large Collections
246 | 
247 | Process large document collections in batches:
248 | 
249 | ```python
250 | from rag.auth import GoogleDriveAuth
251 | from rag.ingestion import LangChainIngestion
252 | 
253 | # Authenticate with Google Drive
254 | auth = GoogleDriveAuth()
255 | drive = auth.authenticate()
256 | 
257 | # Create ingestion instance
258 | ingestion = LangChainIngestion(drive_client=drive)
259 | 
260 | # Get all files in the folder
261 | folder_id = "your_folder_id"
262 | files = ingestion.list_files_in_folder(folder_id, recursive=True)
263 | 
264 | # Process in batches
265 | batch_size = 50
266 | for i in range(0, len(files), batch_size):
267 |     batch = files[i:i+batch_size]
268 |     print(f"Processing batch {i//batch_size + 1}/{(len(files) + batch_size - 1)//batch_size}")
269 |     ingestion.process_files([file['id'] for file in batch])
270 | ```
271 | 
272 | ## Robust Error Handling
273 | 
274 | The system includes built-in error handling for various scenarios:
275 | 
276 | ```python
277 | from rag.ingestion import LangChainIngestion
278 | 
279 | # Configure error handling
280 | ingestion = LangChainIngestion(
281 |     continue_on_error=True,        # Continue processing despite errors
282 |     error_log_file="errors.log",   # Log errors to a file
283 |     retry_count=3,                 # Number of retries for failed operations
284 |     retry_delay=2                  # Delay between retries (seconds)
285 | )
286 | 
287 | # Process folder with error handling
288 | try:
289 |     docs = ingestion.process_folder("your_folder_id")
290 | except Exception as e:
291 |     print(f"Ingestion failed with error: {e}")
292 |     # Handle error gracefully
293 | ```
294 | 
295 | ## Integration with Custom Applications
296 | 
297 | ### Building a Custom RAG Solution
298 | 
299 | Integrate the n8n RAG Example into your own applications:
300 | 
301 | ```python
302 | from rag.auth import GoogleDriveAuth
303 | from rag.ingestion import LangChainIngestion
304 | from rag.query import query_langchain
305 | from langchain_openai import ChatOpenAI
306 | 
307 | # 1. Set up authentication
308 | auth = GoogleDriveAuth()
309 | drive = auth.authenticate()
310 | 
311 | # 2. Create ingestion
312 | ingestion = LangChainIngestion(drive_client=drive)
313 | 
314 | # 3. Process documents
315 | docs = ingestion.process_folder("your_folder_id")
316 | 
317 | # 4. Set up query function
318 | def answer_question(question):
319 |     # Retrieve relevant documents
320 |     relevant_docs = query_langchain(
321 |         query=question,
322 |         table_name="langchain_docs",
323 |         top_k=5
324 |     )
325 |     
326 |     # Format context from retrieved documents
327 |     context = "\n\n".join([doc.page_content for doc in relevant_docs])
328 |     
329 |     # Create LLM
330 |     llm = ChatOpenAI(model="gpt-4")
331 |     
332 |     # Generate response
333 |     prompt = f"""
334 |     Answer the following question based on the provided context:
335 |     
336 |     Context:
337 |     {context}
338 |     
339 |     Question: {question}
340 |     """
341 |     
342 |     response = llm.invoke(prompt)
343 |     return response.content
344 | 
345 | # 5. Use in your application
346 | answer = answer_question("What are the key benefits of renewable energy?")
347 | print(answer)
348 | ```
349 | 
350 | ### Web API Integration
351 | 
352 | Example of creating a simple FastAPI endpoint:
353 | 
354 | ```python
355 | from fastapi import FastAPI, Query
356 | from pydantic import BaseModel
357 | from rag.query import query_langchain
358 | 
359 | app = FastAPI()
360 | 
361 | class QueryRequest(BaseModel):
362 |     query: str
363 |     top_k: int = 5
364 |     filter_metadata: dict = None
365 | 
366 | class Document(BaseModel):
367 |     content: str
368 |     metadata: dict
369 | 
370 | class QueryResponse(BaseModel):
371 |     documents: list[Document]
372 | 
373 | @app.post("/query", response_model=QueryResponse)
374 | async def query_documents(request: QueryRequest):
375 |     results = query_langchain(
376 |         query=request.query,
377 |         table_name="langchain_docs",
378 |         top_k=request.top_k,
379 |         filter_metadata=request.filter_metadata
380 |     )
381 |     
382 |     return {
383 |         "documents": [
384 |             {"content": doc.page_content, "metadata": doc.metadata}
385 |             for doc in results
386 |         ]
387 |     }
388 | ```
389 | 
390 | ## Best Practices
391 | 
392 | ### Document Organization
393 | 
394 | - Organize your Google Drive folders logically by topic or department
395 | - Use consistent naming conventions for files
396 | - Consider creating separate vector stores for different document collections
397 | - Process related documents together to maintain proper context
398 | 
399 | ### Optimizing Retrieval Quality
400 | 
401 | - Use appropriate chunk sizes for your use case:
402 |   - Smaller chunks (300-500 tokens) for precise Q&A
403 |   - Larger chunks (1000+ tokens) for summarization and broader context
404 | - Maintain reasonable chunk overlap (10-20% of chunk size)
405 | - Experiment with different metadata filters to improve relevance
406 | - Periodically refresh your document embeddings to keep them current
407 | 
408 | ### Security Considerations
409 | 
410 | - Use service accounts with minimal necessary permissions
411 | - Store all credentials securely in environment variables
412 | - Implement proper authentication for any web APIs you create
413 | - Consider implementing document-level access control based on metadata
414 | - Regularly rotate API keys and credentials 


--------------------------------------------------------------------------------
/docs/architecture.md:
--------------------------------------------------------------------------------
  1 | # Architecture Overview
  2 | 
  3 | This document provides a detailed explanation of the n8n RAG Example architecture, including components, data flow, and design principles.
  4 | 
  5 | ## System Architecture
  6 | 
  7 | ```mermaid
  8 | flowchart LR
  9 |     GDrive["Google Drive Documents"] --> Auth["Authentication (auth.py)"]
 10 |     Auth --> Ingestion["Document Ingestion (ingestion.py)"]
 11 |     Ingestion --> Processor["Document Processor (utils.py)"]
 12 |     Processor --> VectorStore["Supabase pgVector (store.py)"]
 13 |     VectorStore --> Query["Query Engine (query.py)"]
 14 |     Query --> Results["Search Results"]
 15 |     
 16 |     classDef core fill:#0d5794,stroke:#333,stroke-width:1px;
 17 |     classDef data fill:#b3a41d,stroke:#333,stroke-width:1px;
 18 |     classDef process fill:#a934e3,stroke:#333,stroke-width:1px;
 19 |     
 20 |     class GDrive,Results data;
 21 |     class Auth,Ingestion,Processor,VectorStore,Query core;
 22 | ```
 23 | 
 24 | ## Core Components
 25 | 
 26 | ### 1. Authentication Module (`rag/auth.py`)
 27 | 
 28 | The authentication module handles Google Drive authentication:
 29 | 
 30 | - Supports service account authentication (recommended for backend use)
 31 | - Supports OAuth2 authentication for accessing personal user content
 32 | - Manages token refresh and credential handling
 33 | 
 34 | ### 2. Document Ingestion (`rag/ingestion.py`)
 35 | 
 36 | The ingestion module is responsible for retrieving and processing documents:
 37 | 
 38 | - Retrieves documents from Google Drive
 39 | - Extracts text content and metadata
 40 | - Implements document versioning and change tracking
 41 | - Handles parallel processing with timeout protection
 42 | - Provides direct permissions API with robust SSL error handling
 43 | 
 44 | ### 3. Document Processing (`rag/utils.py`)
 45 | 
 46 | The utilities module provides document processing functionality:
 47 | 
 48 | - Advanced text extraction from various document types
 49 | - Content-aware chunking with boundary preservation
 50 | - OCR for image text extraction
 51 | - Table extraction from PDFs
 52 | - Image analysis with computer vision techniques
 53 | 
 54 | ### 4. Vector Storage (`rag/store.py`)
 55 | 
 56 | The store module manages vector database operations:
 57 | 
 58 | - Handles connections to Supabase pgvector
 59 | - Provides efficient embedding storage and retrieval
 60 | - Supports metadata-rich vector storage
 61 | - Implements optimized vector indexes for search
 62 | 
 63 | ### 5. Query Engine (`rag/query.py`)
 64 | 
 65 | The query engine provides retrieval capabilities:
 66 | 
 67 | - Performs vector similarity search
 68 | - Supports metadata filtering
 69 | - Retrieves relevant document chunks based on queries
 70 | - Formats results for consumption
 71 | 
 72 | ## Data Flow
 73 | 
 74 | ### Ingestion Flow
 75 | 
 76 | ```mermaid
 77 | sequenceDiagram
 78 |     participant GD as Google Drive
 79 |     participant Auth as Authentication
 80 |     participant Ingest as LangChainIngestion
 81 |     participant Process as DocumentProcessor
 82 |     participant VS as Supabase Vector Store
 83 |     
 84 |     GD->>Auth: Authenticate (Service Account/OAuth)
 85 |     Auth->>Ingest: Retrieve document list
 86 |     Ingest->>GD: Request document content
 87 |     GD->>Ingest: Return documents & metadata
 88 |     Ingest->>Process: Process documents in parallel
 89 |     Process->>Process: 1. Extract text & metadata
 90 |     Process->>Process: 2. Check document version
 91 |     Process->>Process: 3. Skip unchanged documents
 92 |     Process->>Process: 4. Split into chunks
 93 |     Process->>VS: Generate embeddings & store
 94 | ```
 95 | 
 96 | ### Query Flow
 97 | 
 98 | ```mermaid
 99 | sequenceDiagram
100 |     participant User
101 |     participant Query as Query Engine
102 |     participant VS as Supabase Vector Store
103 |     participant Results as Search Results
104 |     
105 |     User->>Query: Submit query
106 |     Query->>Query: Generate query embedding
107 |     Query->>VS: Perform similarity search
108 |     VS->>Query: Return relevant chunks
109 |     Query->>Results: Format and deliver results
110 |     Results->>User: Display results
111 | ```
112 | 
113 | ## Key Features
114 | 
115 | ### Document Versioning
116 | 
117 | The system tracks document versions to optimize processing and storage:
118 | 
119 | ```mermaid
120 | flowchart TD
121 |     A[Document] --> B{Changed Since Last Processed?}
122 |     B -->|No| C[Skip Processing]
123 |     B -->|Yes| D{Content Changed?}
124 |     D -->|No| E[Update Metadata Only]
125 |     D -->|Yes| F[Process & Re-embed]
126 |     F --> G[Update Version History]
127 |     E --> G
128 | ```
129 | 
130 | ### Advanced Media Processing
131 | 
132 | The system includes multi-modal processing capabilities:
133 | 
134 | ```mermaid
135 | flowchart TD
136 |     A[Image] -->|OCR Pipeline| B[Preprocess Image]
137 |     B --> C[Apply Tesseract OCR]
138 |     C --> D[Extract Text & Confidence]
139 |     
140 |     E[PDF] -->|Table Extraction| F[Try tabula-py]
141 |     F -->|Success| G[Extract Tables]
142 |     F -->|Failure| H[Try camelot-py]
143 |     H --> G
144 |     
145 |     I[Image] -->|Computer Vision| J[Face Detection]
146 |     I -->|Image Analysis| K[Color & Edge Analysis]
147 |     J --> L[Generate Description]
148 |     K --> L
149 | ```
150 | 
151 | ### Permissions Handling
152 | 
153 | Robust permissions handling for Google Drive documents:
154 | 
155 | ```mermaid
156 | flowchart LR
157 |     A[Document Processing] --> B{Connection Issues?}
158 |     B -->|SSL Error| C[Unverified SSL Context]
159 |     B -->|No Error| D[Standard Connection]
160 |     C --> E[Short Timeout]
161 |     D --> F[Fetch Permissions]
162 |     E --> F
163 |     F --> G[Permissions API]
164 |     G --> H{API Success?}
165 |     H -->|Yes| I[Store Permissions]
166 |     H -->|No| J[Graceful Fallback]
167 |     I --> K[Permissions in Metadata]
168 |     J --> K
169 | ```
170 | 
171 | ### Content-Aware Chunking
172 | 
173 | Advanced chunking strategies for optimal document splitting:
174 | 
175 | ```mermaid
176 | flowchart TD
177 |     A[Document Text] --> B[Recursive Character Splitter]
178 |     B --> C{Content-Aware Splitting?}
179 |     C -->|Yes| D[Respect Paragraph Boundaries]
180 |     C -->|No| E[Fixed-Size Chunks]
181 |     D --> F[Chunk with Overlap]
182 |     E --> F
183 |     F --> G[Generate Embeddings]
184 |     G --> H[Store in Vector DB]
185 | ```
186 | 
187 | ## Database Schema
188 | 
189 | ### Supabase pgvector Schema
190 | 
191 | ```sql
192 | -- Main documents table
193 | CREATE TABLE langchain_docs (
194 |   id TEXT PRIMARY KEY,
195 |   content TEXT,
196 |   metadata JSONB,
197 |   embedding VECTOR(1536)
198 | );
199 | 
200 | -- Example table for testing
201 | CREATE TABLE langchain_example (
202 |   id TEXT PRIMARY KEY,
203 |   content TEXT,
204 |   metadata JSONB,
205 |   embedding VECTOR(1536)
206 | );
207 | 
208 | -- Indexes for efficient vector search
209 | CREATE INDEX ON langchain_docs USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);
210 | CREATE INDEX IF NOT EXISTS hnsw_index ON langchain_docs USING hnsw (embedding vector_cosine_ops);
211 | ```
212 | 
213 | ## Design Principles
214 | 
215 | ### 1. Modularity
216 | 
217 | The system is designed with modularity in mind:
218 | - Clear separation of concerns between components
219 | - Well-defined interfaces between modules
220 | - Pluggable components that can be extended or replaced
221 | 
222 | ### 2. Performance Optimization
223 | 
224 | Key performance features include:
225 | - Parallel document processing
226 | - Optimized vector search with specialized indexes
227 | - Skip processing for unchanged documents
228 | - Timeout protection for problematic files
229 | 
230 | ### 3. Reliability
231 | 
232 | The system prioritizes reliability:
233 | - Robust error handling
234 | - Graceful degradation when subsystems fail
235 | - Comprehensive logging
236 | - SSL error recovery
237 | 
238 | ### 4. Rich Metadata
239 | 
240 | The system captures and utilizes rich metadata:
241 | - Document-level versioning
242 | - Comprehensive permission information
243 | - Detailed content analysis
244 | - Source tracking and provenance
245 | 
246 | ## Project Structure
247 | 
248 | ```
249 | n8n-rag-example/
250 | ├── config/                      # Configuration files
251 | │   └── service-account.json     # Google service account credentials
252 | ├── docs/                        # Documentation
253 | │   ├── advanced_usage.md        # Advanced usage guide
254 | │   ├── architecture.md          # Architecture documentation
255 | │   ├── quickstart.md            # Quick start guide
256 | │   ├── supabase_setup.md        # Supabase setup instructions
257 | │   └── troubleshooting.md       # Troubleshooting guide
258 | ├── examples/                    # Example code
259 | │   └── langchain_examples.py    # Comprehensive usage examples
260 | ├── rag/                         # Core RAG module
261 | │   ├── __init__.py              # Package initialization
262 | │   ├── auth.py                  # Authentication utilities
263 | │   ├── ingestion.py             # Document ingestion logic
264 | │   ├── query.py                 # Query processing
265 | │   ├── store.py                 # Vector store management
266 | │   └── utils.py                 # Helper utilities
267 | ├── utils/                       # Utility modules
268 | │   └── display_utils.py         # Display utilities for examples
269 | ├── setup_vector_store.sql       # SQL setup for Supabase
270 | ├── setup.sh                     # Setup script for Linux/macOS
271 | ├── setup.bat                    # Setup script for Windows
272 | └── requirements.txt             # Python dependencies
273 | ``` 


--------------------------------------------------------------------------------
/docs/quickstart.md:
--------------------------------------------------------------------------------
  1 | # Quick Start Guide
  2 | 
  3 | This guide will help you get started with the n8n RAG Example project quickly.
  4 | 
  5 | ## Prerequisites
  6 | 
  7 | - Python 3.9+
  8 | - OpenAI API key
  9 | - Supabase account (for vector storage)
 10 | - Google Drive access with service account (recommended) or OAuth2
 11 | 
 12 | ## Five-Minute Setup
 13 | 
 14 | 1. **Clone the repository:**
 15 | 
 16 | ```bash
 17 | git clone https://github.com/Getting-Automated/n8n-rag-example.git
 18 | cd n8n-rag-example
 19 | ```
 20 | 
 21 | 2. **Run the setup script:**
 22 | 
 23 | For Mac/Linux:
 24 | ```bash
 25 | chmod +x setup.sh
 26 | ./setup.sh
 27 | ```
 28 | 
 29 | For Windows:
 30 | ```bash
 31 | setup.bat
 32 | ```
 33 | 
 34 | 3. **Configure your environment:**
 35 | 
 36 | Copy the example file and fill in your credentials:
 37 | ```bash
 38 | cp .env.example .env
 39 | ```
 40 | 
 41 | Edit the `.env` file with your credentials:
 42 | ```
 43 | # OpenAI API Key
 44 | OPENAI_API_KEY=your_openai_api_key_here
 45 | 
 46 | # Supabase Configuration
 47 | SUPABASE_URL=your_supabase_url_here
 48 | SUPABASE_API_KEY=your_supabase_api_key_here
 49 | 
 50 | # Google Drive Configuration - Service Account (recommended)
 51 | GOOGLE_SERVICE_ACCOUNT_PATH=config/service-account.json
 52 | 
 53 | # Alternatively, for OAuth2 (personal account access)
 54 | # GOOGLE_CLIENT_ID=your_client_id_here
 55 | # GOOGLE_CLIENT_SECRET=your_client_secret_here
 56 | # GOOGLE_REFRESH_TOKEN=your_refresh_token_here
 57 | ```
 58 | 
 59 | 4. **Set up Google Drive access:**
 60 | 
 61 | #### Service Account Method (Recommended)
 62 | 1. Go to the [Google Cloud Console](https://console.cloud.google.com/)
 63 | 2. Create a new project or select an existing one
 64 | 3. Navigate to "APIs & Services" > "Library" and enable "Google Drive API"
 65 | 4. Go to "APIs & Services" > "Credentials"
 66 | 5. Click "Create Credentials" > "Service Account"
 67 | 6. Fill in service account details and click "Create"
 68 | 7. On the service account page, go to "Keys" tab
 69 | 8. Add a new JSON key and download it
 70 | 9. Save the JSON file to `config/service-account.json`
 71 | 10. Share your Google Drive folders with the service account email
 72 | 
 73 | 5. **Set up Supabase vector database:**
 74 | 
 75 | 1. Create a Supabase account at [supabase.com](https://supabase.com/) if you don't have one
 76 | 2. Create a new project in the Supabase dashboard
 77 | 3. Copy your project URL and API key to the `.env` file
 78 | 4. Run our setup script to create the necessary tables and indexes:
 79 | 
 80 | ```bash
 81 | python -c "import setup_vector_store; setup_vector_store.setup()"
 82 | ```
 83 | 
 84 | ## Run Your First RAG Example
 85 | 
 86 | ```bash
 87 | # Activate your virtual environment
 88 | source venv/bin/activate  # Linux/macOS
 89 | venv\Scripts\activate     # Windows
 90 | 
 91 | # Run the examples script
 92 | python examples/langchain_examples.py
 93 | ```
 94 | 
 95 | You'll see a menu of options:
 96 | 1. Basic ingestion - Process all folders with standard settings
 97 | 2. Advanced ingestion - Process with optimized parameters and enhanced features
 98 | 3. Specific files - Target individual documents with precision processing
 99 | 4. Enhanced media - Process images and PDFs with OCR, table extraction, and image analysis
100 | 
101 | ## Basic Usage
102 | 
103 | ### 1. Ingest documents from Google Drive
104 | 
105 | ```python
106 | from rag.auth import GoogleDriveAuth
107 | from rag.ingestion import LangChainIngestion
108 | 
109 | # Authenticate with Google Drive
110 | auth = GoogleDriveAuth()
111 | drive = auth.authenticate()
112 | 
113 | # Create an ingestion instance
114 | ingestion = LangChainIngestion(drive_client=drive)
115 | 
116 | # Process documents from a Google Drive folder
117 | folder_id = "your_google_drive_folder_id"
118 | documents = ingestion.process_folder(folder_id)
119 | ```
120 | 
121 | ### 2. Query your documents
122 | 
123 | ```python
124 | from rag.query import query_langchain
125 | 
126 | # Perform a similarity search
127 | results = query_langchain(
128 |     query="What is RAG?",
129 |     table_name="langchain_docs",
130 |     top_k=5
131 | )
132 | 
133 | # Display results
134 | for doc in results:
135 |     print(f"Content: {doc.page_content}")
136 |     print(f"Metadata: {doc.metadata}")
137 |     print("---")
138 | ```
139 | 
140 | ## Next Steps
141 | 
142 | After completing this quickstart guide, you can:
143 | 
144 | - Try advanced ingestion with custom parameters (see [Advanced Usage](advanced_usage.md))
145 | - Explore the different processing options for various document types
146 | - Set up custom embedding models and vector storage configurations
147 | - Implement your own RAG application using our query engine
148 | 
149 | For detailed information on system architecture, refer to the [Architecture Overview](architecture.md).
150 | 
151 | ## Directory Structure
152 | 
153 | The repository is organized as follows:
154 | 
155 | ```
156 | .
157 | ├── docs/                   # Documentation
158 | │   ├── architecture.md     # System architecture documentation
159 | │   ├── advanced_usage.md   # Advanced usage patterns
160 | │   ├── quickstart.md       # Quickstart guide (this file)
161 | │   ├── supabase_setup.md   # Supabase setup instructions
162 | │   └── troubleshooting.md  # Common issues and solutions
163 | ├── examples/               # Example code
164 | │   ├── langchain_examples.py
165 | │   ├── llamaindex_examples.py
166 | │   └── query_examples.py
167 | ├── setup/                  # Setup scripts
168 | │   ├── database/           # Database setup scripts
169 | │   │   ├── setup_supabase.py
170 | │   │   ├── setup_supabase.sh
171 | │   │   └── setup_supabase.bat
172 | │   └── environment/        # Environment setup scripts
173 | │       ├── setup.sh
174 | │       └── setup.bat
175 | ├── src/                    # Source code
176 | │   ├── auth/               # Authentication modules
177 | │   ├── ingestion/          # Ingestion modules
178 | │   ├── query/              # Query modules
179 | │   └── utils/              # Utility modules
180 | ```
181 | 
182 | ## Common Issues
183 | 
184 | - **Authentication errors**: Ensure your Google credentials are correct and have the necessary permissions
185 | - **Import errors**: Make sure you've activated the virtual environment
186 | - **Missing packages**: Run pip install -r requirements.txt 


--------------------------------------------------------------------------------
/docs/supabase_setup.md:
--------------------------------------------------------------------------------
  1 | # Supabase Vector Storage Setup Guide
  2 | 
  3 | This guide provides detailed instructions for setting up Supabase as your vector database for the n8n RAG Example project.
  4 | 
  5 | ## Overview
  6 | 
  7 | Supabase provides a scalable, cloud-hosted PostgreSQL database with pgvector extension, making it perfect for production RAG applications. The setup involves:
  8 | 
  9 | 1. Creating a Supabase project
 10 | 2. Enabling the pgvector extension
 11 | 3. Creating tables for document storage
 12 | 4. Setting up similarity search functions
 13 | 
 14 | ## Getting Started
 15 | 
 16 | ### 1. Create a Supabase Project
 17 | 
 18 | 1. Go to [Supabase.com](https://supabase.com/) and sign up/login
 19 | 2. Create a new project and set a secure database password
 20 | 3. Note your project URL and API key (found under Project Settings > API)
 21 | 4. Add these credentials to your `.env` file:
 22 |    ```
 23 |    SUPABASE_URL=https://your-project-id.supabase.co
 24 |    SUPABASE_API_KEY=your-supabase-api-key
 25 |    ```
 26 | 
 27 | ### 2. Setting Up pgvector
 28 | 
 29 | #### Automated Setup (Recommended)
 30 | 
 31 | The easiest way to set up your vector database is to use our provided SQL script:
 32 | 
 33 | 1. Go to the SQL Editor in your Supabase dashboard
 34 | 2. Upload the provided `setup_vector_store.sql` file or copy-paste its contents
 35 | 3. Run the script to create all necessary tables and functions
 36 | 
 37 | Alternatively, you can run our Python setup utility:
 38 | 
 39 | ```bash
 40 | # Run the included setup script
 41 | python -c "import setup_vector_store; setup_vector_store.setup()"
 42 | ```
 43 | 
 44 | #### Manual Setup
 45 | 
 46 | If you prefer to set up your vector database manually, follow these steps:
 47 | 
 48 | 1. Enable the pgvector extension in the SQL Editor:
 49 |    ```sql
 50 |    CREATE EXTENSION IF NOT EXISTS vector;
 51 |    ```
 52 | 
 53 | 2. Create the basic vector tables:
 54 |    ```sql
 55 |    -- Main documents table
 56 |    CREATE TABLE langchain_docs (
 57 |      id TEXT PRIMARY KEY,
 58 |      content TEXT,
 59 |      metadata JSONB,
 60 |      embedding VECTOR(1536)
 61 |    );
 62 | 
 63 |    -- Example table for testing
 64 |    CREATE TABLE langchain_example (
 65 |      id TEXT PRIMARY KEY,
 66 |      content TEXT,
 67 |      metadata JSONB,
 68 |      embedding VECTOR(1536)
 69 |    );
 70 |    ```
 71 | 
 72 | 3. Create vector similarity search indexes:
 73 |    ```sql
 74 |    -- IVF index (good balance of speed and accuracy)
 75 |    CREATE INDEX ON langchain_docs USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);
 76 |    
 77 |    -- HNSW index (faster for larger datasets)
 78 |    CREATE INDEX IF NOT EXISTS hnsw_index ON langchain_docs USING hnsw (embedding vector_cosine_ops);
 79 |    ```
 80 | 
 81 | ## Table Structure
 82 | 
 83 | The Supabase setup creates the following tables:
 84 | 
 85 | | Table Name | Purpose |
 86 | |------------|---------|
 87 | | `langchain_docs` | Primary storage for your document embeddings |
 88 | | `langchain_example` | For testing and example purposes |
 89 | 
 90 | Each table has the following structure:
 91 | 
 92 | | Column | Type | Description |
 93 | |--------|------|-------------|
 94 | | `id` | TEXT | Unique identifier for each document chunk |
 95 | | `content` | TEXT | The text content of the document chunk |
 96 | | `metadata` | JSONB | Metadata about the document (source, filename, etc.) |
 97 | | `embedding` | VECTOR(1536) | The OpenAI embedding vector (1536 dimensions) |
 98 | 
 99 | ## Understanding Vector Indexes
100 | 
101 | The setup creates two types of indexes for efficient similarity search:
102 | 
103 | ### 1. IVFFlat Index
104 | 
105 | ```sql
106 | CREATE INDEX ON langchain_docs USING ivfflat (embedding vector_cosine_ops) WITH (lists = 100);
107 | ```
108 | 
109 | - **Purpose**: General-purpose index with good balance of search speed and accuracy
110 | - **How it works**: Partitions vectors into lists for faster search
111 | - **Best for**: Medium-sized collections (thousands to tens of thousands of documents)
112 | 
113 | ### 2. HNSW Index
114 | 
115 | ```sql
116 | CREATE INDEX IF NOT EXISTS hnsw_index ON langchain_docs USING hnsw (embedding vector_cosine_ops);
117 | ```
118 | 
119 | - **Purpose**: High-performance approximate nearest neighbor search
120 | - **How it works**: Creates a hierarchical graph structure for efficient navigation
121 | - **Best for**: Large collections and performance-critical applications
122 | 
123 | ## Using with the RAG System
124 | 
125 | Once your Supabase vector database is set up, the n8n RAG Example will automatically use it for document storage and retrieval:
126 | 
127 | ```python
128 | from rag.auth import GoogleDriveAuth
129 | from rag.ingestion import LangChainIngestion
130 | 
131 | # Authenticate with Google Drive
132 | auth = GoogleDriveAuth()
133 | drive = auth.authenticate()
134 | 
135 | # Process documents from Google Drive
136 | ingestion = LangChainIngestion(drive_client=drive)
137 | 
138 | # Documents are automatically stored in Supabase
139 | documents = ingestion.process_folder("your_folder_id")
140 | ```
141 | 
142 | ## Performance Optimization
143 | 
144 | For production workloads with large document collections, consider these optimizations:
145 | 
146 | 1. **Increase lists parameter** for IVFFlat index with larger datasets:
147 |    ```sql
148 |    CREATE INDEX ON langchain_docs USING ivfflat (embedding vector_cosine_ops) WITH (lists = 1000);
149 |    ```
150 | 
151 | 2. **Tune HNSW parameters** for better performance:
152 |    ```sql
153 |    CREATE INDEX hnsw_index ON langchain_docs 
154 |    USING hnsw (embedding vector_cosine_ops) 
155 |    WITH (m = 16, ef_construction = 128);
156 |    ```
157 | 
158 | 3. **Create a custom function** for filtered searches:
159 |    ```sql
160 |    CREATE OR REPLACE FUNCTION match_documents_filtered (
161 |      query_embedding VECTOR(1536),
162 |      filter_condition TEXT,
163 |      match_count INT DEFAULT 5
164 |    ) RETURNS TABLE (
165 |      id TEXT,
166 |      content TEXT,
167 |      metadata JSONB,
168 |      similarity FLOAT
169 |    ) LANGUAGE plpgsql AS $$
170 |    BEGIN
171 |      RETURN QUERY EXECUTE 
172 |      format('SELECT id, content, metadata, 1 - (embedding <=> %L::vector) AS similarity
173 |              FROM langchain_docs
174 |              WHERE %s
175 |              ORDER BY similarity DESC
176 |              LIMIT %s',
177 |              query_embedding, filter_condition, match_count);
178 |    END;
179 |    $$;
180 |    ```
181 | 
182 | ## Troubleshooting
183 | 
184 | ### Common Issues and Solutions
185 | 
186 | - **"Extension 'vector' does not exist"**: Ensure you've created the extension with `CREATE EXTENSION IF NOT EXISTS vector;`
187 | 
188 | - **"Relation does not exist"**: Tables weren't created properly. Check for errors in the SQL script execution.
189 | 
190 | - **Slow query performance**: Make sure you've created the proper indexes for your workload. For large collections, use the HNSW index.
191 | 
192 | - **Connection errors**: Verify your Supabase URL and API key in the `.env` file. Check that your IP is not restricted in Supabase settings.
193 | 
194 | - **Permission errors**: Ensure your service key has the necessary permissions. For testing, you can use the `anon` key, but for production, create a custom API key with appropriate permissions.
195 | 
196 | ### Getting Additional Help
197 | 
198 | If you're experiencing issues with Supabase:
199 | 
200 | 1. Check the [Supabase documentation](https://supabase.com/docs) for general guidance
201 | 2. Review the [pgvector documentation](https://github.com/pgvector/pgvector) for vector-specific issues
202 | 3. See our [Troubleshooting Guide](troubleshooting.md) for more project-specific help 


--------------------------------------------------------------------------------
/docs/troubleshooting.md:
--------------------------------------------------------------------------------
  1 | # Troubleshooting Guide
  2 | 
  3 | This guide addresses common issues you might encounter when using the n8n RAG Example project.
  4 | 
  5 | ## Installation Issues
  6 | 
  7 | ### Python Environment Problems
  8 | 
  9 | **Issue**: `ModuleNotFoundError: No module named '<package_name>'`  
 10 | **Solution**: Ensure your virtual environment is activated and reinstall dependencies:
 11 | ```bash
 12 | # Linux/macOS
 13 | source venv/bin/activate
 14 | pip install -r requirements.txt
 15 | 
 16 | # Windows
 17 | venv\Scripts\activate
 18 | pip install -r requirements.txt
 19 | ```
 20 | 
 21 | **Issue**: Python version compatibility errors  
 22 | **Solution**: Ensure you're using Python 3.9 or higher:
 23 | ```bash
 24 | python --version
 25 | ```
 26 | 
 27 | ### Virtual Environment Problems
 28 | 
 29 | **Issue**: Virtual environment not working correctly  
 30 | **Solution**: Try recreating it:
 31 | ```bash
 32 | # Linux/macOS
 33 | rm -rf venv
 34 | ./setup.sh
 35 | 
 36 | # Windows
 37 | rmdir /s /q venv
 38 | setup.bat
 39 | ```
 40 | 
 41 | ### Missing Dependencies for Advanced Features
 42 | 
 43 | **Issue**: Errors related to OCR, table extraction, or computer vision features  
 44 | **Solution**: Install the required external dependencies:
 45 | 
 46 | For OCR (Tesseract):
 47 | ```bash
 48 | # Linux
 49 | sudo apt-get install tesseract-ocr
 50 | # macOS
 51 | brew install tesseract
 52 | # Windows: Download and install from https://github.com/UB-Mannheim/tesseract/wiki
 53 | ```
 54 | 
 55 | For PDF table extraction:
 56 | ```bash
 57 | # Linux
 58 | sudo apt-get install default-jre ghostscript
 59 | # macOS
 60 | brew install openjdk ghostscript
 61 | # Windows: Install Java Runtime Environment and Ghostscript
 62 | ```
 63 | 
 64 | ## Authentication Issues
 65 | 
 66 | ### Google Drive Authentication Problems
 67 | 
 68 | **Issue**: `google.auth.exceptions.DefaultCredentialsError`  
 69 | **Solution**: Ensure your service account credentials are correctly set up:
 70 | 1. Verify the `service-account.json` file exists in the `config/` directory
 71 | 2. Check that the Google Drive API is enabled in your Google Cloud project
 72 | 3. Ensure your service account has the necessary permissions on the target folders
 73 | 
 74 | **Issue**: Permission denied errors when accessing Google Drive  
 75 | **Solution**: 
 76 | 1. Ensure the service account has been granted access to the target folders/files
 77 | 2. Share the folders explicitly with the service account email address
 78 | 3. For shared drives, ensure the service account has been added to the shared drive
 79 | 
 80 | **Issue**: SSL connection errors with Google Drive API  
 81 | **Solution**:
 82 | 1. These are handled gracefully by the system and shouldn't cause failures
 83 | 2. If persistent, try updating your SSL certificates or temporarily disable SSL verification in your test environment
 84 | 
 85 | ## Supabase Issues
 86 | 
 87 | **Issue**: Connection errors to Supabase  
 88 | **Solution**: 
 89 | 1. Verify your Supabase URL and API key in the `.env` file
 90 | 2. Check that your IP address is not restricted in Supabase dashboard settings
 91 | 3. Test your connection with a simple query:
 92 | ```python
 93 | from supabase import create_client
 94 | import os
 95 | 
 96 | url = os.environ.get("SUPABASE_URL")
 97 | key = os.environ.get("SUPABASE_API_KEY")
 98 | supabase = create_client(url, key)
 99 | response = supabase.table('langchain_docs').select('id').limit(1).execute()
100 | print(response)
101 | ```
102 | 
103 | **Issue**: `relation "langchain_docs" does not exist`  
104 | **Solution**: 
105 | 1. Run the Supabase setup script:
106 | ```bash
107 | python -c "import setup_vector_store; setup_vector_store.setup()"
108 | ```
109 | 2. Or manually create the tables as described in [Supabase Setup Guide](supabase_setup.md)
110 | 
111 | **Issue**: pgvector extension errors  
112 | **Solution**:
113 | 1. Ensure the pgvector extension is enabled:
114 | ```sql
115 | CREATE EXTENSION IF NOT EXISTS vector;
116 | ```
117 | 2. Verify your Supabase plan supports extensions
118 | 
119 | ## Ingestion Issues
120 | 
121 | **Issue**: Memory errors when processing large files  
122 | **Solution**: 
123 | 1. Decrease the chunk size in your code:
124 | ```python
125 | ingestion = LangChainIngestion(chunk_size=300, chunk_overlap=30)
126 | ```
127 | 2. Process fewer files at a time
128 | 3. Increase your system's available memory
129 | 
130 | **Issue**: Slow ingestion speed  
131 | **Solution**:
132 | 1. Enable parallel processing with a reasonable worker count:
133 | ```python
134 | ingestion = LangChainIngestion(workers=4, use_parallel=True)
135 | ```
136 | 2. Use a simpler embedding model if available
137 | 3. Process only files that have changed since last ingestion
138 | 
139 | **Issue**: Timeout errors with large collections  
140 | **Solution**:
141 | 1. Increase the timeout setting:
142 | ```python
143 | ingestion = LangChainIngestion(timeout=180)  # 3 minutes timeout
144 | ```
145 | 2. Process in smaller batches
146 | 
147 | **Issue**: SSL connection errors during ingestion  
148 | **Solution**: These are automatically handled by the robust SSL error handling in the system
149 | 
150 | ## Query Issues
151 | 
152 | **Issue**: No results returned from queries  
153 | **Solution**:
154 | 1. Verify documents were successfully ingested (check the Supabase tables)
155 | 2. Try simpler queries
156 | 3. Increase the number of results requested:
157 | ```python
158 | results = query_langchain(query="your query", top_k=10)
159 | ```
160 | 4. Ensure your embedding model matches the one used during ingestion
161 | 
162 | **Issue**: Irrelevant results  
163 | **Solution**:
164 | 1. Reformulate your query to be more specific
165 | 2. Try using metadata filters to narrow results:
166 | ```python
167 | results = query_langchain(
168 |     query="your query",
169 |     filter_metadata={"source_type": "pdf"}
170 | )
171 | ```
172 | 3. Adjust the similarity threshold if applicable
173 | 
174 | **Issue**: Slow query performance  
175 | **Solution**:
176 | 1. Ensure you've created appropriate indexes in Supabase (see [Supabase Setup Guide](supabase_setup.md))
177 | 2. Use smaller result sets (lower `top_k` values)
178 | 3. Apply metadata filters to reduce the search space
179 | 
180 | ## OpenAI API Issues
181 | 
182 | **Issue**: OpenAI API authentication errors  
183 | **Solution**:
184 | 1. Ensure your OpenAI API key is correctly set in the `.env` file
185 | 2. Verify your key has not expired and has sufficient quota
186 | 3. Check network connectivity to the OpenAI API
187 | 
188 | **Issue**: `RateLimitError` from OpenAI  
189 | **Solution**:
190 | 1. Implement exponential backoff:
191 | ```python
192 | import time
193 | import random
194 | for attempt in range(5):
195 |     try:
196 |         # Your OpenAI API call
197 |         break
198 |     except RateLimitError:
199 |         time.sleep((2 ** attempt) + random.random())
200 | ```
201 | 2. Reduce parallel requests
202 | 3. Upgrade your OpenAI API plan
203 | 
204 | ## Specific Feature Issues
205 | 
206 | ### OCR Issues
207 | 
208 | **Issue**: OCR not extracting text correctly  
209 | **Solution**:
210 | 1. Ensure Tesseract is properly installed
211 | 2. Check image quality and resolution
212 | 3. Try preprocessing the image (the system does this automatically)
213 | 4. For languages other than English, install language packs for Tesseract
214 | 
215 | ### PDF Table Extraction Issues
216 | 
217 | **Issue**: Tables not being extracted correctly  
218 | **Solution**:
219 | 1. Ensure Java Runtime Environment and Ghostscript are properly installed
220 | 2. Try different extraction methods (the system attempts multiple methods automatically)
221 | 3. For complex tables, consider pre-processing the PDFs
222 | 
223 | ## Getting Additional Help
224 | 
225 | If you're still experiencing issues:
226 | 
227 | 1. Check for similar issues in the [GitHub repository](https://github.com/Getting-Automated/n8n-rag-example/issues)
228 | 2. Enable debug logging for more detailed information:
229 | ```python
230 | import logging
231 | logging.basicConfig(level=logging.DEBUG)
232 | ```
233 | 3. When reporting issues, include:
234 |    - Complete error messages and stack traces
235 |    - Steps to reproduce the problem
236 |    - Your environment details (OS, Python version, etc.)
237 |    - Specific file types and sizes causing issues 


--------------------------------------------------------------------------------
/rag/__init__.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """
 3 | N8N RAG Example Package
 4 | 
 5 | This package provides tools and utilities for implementing RAG patterns using LangChain
 6 | and Supabase with documents from Google Drive.
 7 | """
 8 | 
 9 | __version__ = "0.1.0"
10 | 
11 | from .auth import GoogleDriveAuth
12 | from .ingestion import LangChainIngestion
13 | from .query import query_langchain
14 | from .store import SupabaseStore
15 | 
16 | # For backwards compatibility
17 | import sys
18 | import os
19 | 
20 | # Add compat imports for old structure users
21 | sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


--------------------------------------------------------------------------------
/rag/auth.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """Authentication utilities for Google services"""
  3 | 
  4 | import os
  5 | import json
  6 | import logging
  7 | from typing import Optional, Dict, Any
  8 | from dotenv import load_dotenv
  9 | 
 10 | # Configure logging
 11 | logging.basicConfig(
 12 |     level=logging.INFO,
 13 |     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 14 | )
 15 | logger = logging.getLogger(__name__)
 16 | 
 17 | # Load environment variables
 18 | load_dotenv()
 19 | 
 20 | try:
 21 |     from google.oauth2.credentials import Credentials
 22 |     from google_auth_oauthlib.flow import InstalledAppFlow
 23 |     from google.auth.transport.requests import Request
 24 |     from googleapiclient.discovery import build
 25 |     from google.oauth2 import service_account
 26 | except ImportError:
 27 |     logger.warning("Google API libraries not installed. Run: pip install google-api-python-client google-auth-httplib2 google-auth-oauthlib")
 28 | 
 29 | class GoogleDriveAuth:
 30 |     """Authentication handler for Google Drive API."""
 31 |     
 32 |     def __init__(
 33 |         self, 
 34 |         client_id: Optional[str] = None,
 35 |         client_secret: Optional[str] = None,
 36 |         refresh_token: Optional[str] = None,
 37 |         credentials_path: Optional[str] = None,
 38 |         service_account_path: Optional[str] = None,
 39 |         scopes: Optional[list] = None
 40 |     ):
 41 |         """Initialize the Google Drive authentication.
 42 |         
 43 |         Args:
 44 |             client_id: Google API client ID
 45 |             client_secret: Google API client secret
 46 |             refresh_token: OAuth refresh token
 47 |             credentials_path: Path to credentials.json file
 48 |             service_account_path: Path to service account JSON file
 49 |             scopes: OAuth scopes to request
 50 |         """
 51 |         # Default scopes for Google Drive
 52 |         self.scopes = scopes or [
 53 |             'https://www.googleapis.com/auth/drive.readonly',
 54 |             'https://www.googleapis.com/auth/drive.metadata.readonly'
 55 |         ]
 56 |         
 57 |         # Store auth parameters
 58 |         self.client_id = client_id or os.getenv("GOOGLE_CLIENT_ID")
 59 |         self.client_secret = client_secret or os.getenv("GOOGLE_CLIENT_SECRET")
 60 |         self.refresh_token = refresh_token or os.getenv("GOOGLE_REFRESH_TOKEN")
 61 |         
 62 |         # Get paths to credential files
 63 |         module_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))  # clean_repo directory
 64 |         self.credentials_path = credentials_path or os.getenv("GOOGLE_CREDENTIALS_PATH") or os.path.join(module_dir, "config/credentials.json")
 65 |         self.service_account_path = service_account_path or os.getenv("GOOGLE_SERVICE_ACCOUNT_PATH") or os.path.join(module_dir, "config/service-account.json")
 66 |         
 67 |         # Initialize credentials
 68 |         self.credentials = None
 69 |         self.service = None
 70 |         
 71 |     def get_credentials(self) -> Credentials:
 72 |         """Get OAuth2 credentials for Google API.
 73 |         
 74 |         Returns:
 75 |             Google OAuth2 credentials object
 76 |         """
 77 |         if self.credentials:
 78 |             return self.credentials
 79 |             
 80 |         # Try to create credentials from various sources
 81 |         if self.refresh_token and self.client_id and self.client_secret:
 82 |             logger.info("Creating credentials from refresh token")
 83 |             # Create credentials from environment variables
 84 |             self.credentials = Credentials(
 85 |                 None,  # No access token initially
 86 |                 refresh_token=self.refresh_token,
 87 |                 token_uri="https://oauth2.googleapis.com/token",
 88 |                 client_id=self.client_id,
 89 |                 client_secret=self.client_secret,
 90 |                 scopes=self.scopes
 91 |             )
 92 |         elif os.path.exists(self.service_account_path):
 93 |             logger.info(f"Creating credentials from service account: {self.service_account_path}")
 94 |             # Create credentials from service account
 95 |             self.credentials = service_account.Credentials.from_service_account_file(
 96 |                 self.service_account_path, 
 97 |                 scopes=self.scopes
 98 |             )
 99 |         elif os.path.exists(self.credentials_path):
100 |             logger.info(f"Creating credentials from OAuth file: {self.credentials_path}")
101 |             # Try to load credentials from local file
102 |             try:
103 |                 with open(self.credentials_path, 'r') as f:
104 |                     creds_data = json.load(f)
105 |                 
106 |                 # Check if this is a credentials.json format or token.json format
107 |                 if 'installed' in creds_data:
108 |                     # This is a credentials.json file - we need to generate a token
109 |                     flow = InstalledAppFlow.from_client_secrets_file(
110 |                         self.credentials_path, self.scopes)
111 |                     self.credentials = flow.run_local_server(port=0)
112 |                     
113 |                     # Save the credentials for future use
114 |                     token_path = os.path.join(os.path.dirname(self.credentials_path), 'token.json')
115 |                     with open(token_path, 'w') as token:
116 |                         token.write(self.credentials.to_json())
117 |                         logger.info(f"Saved credentials to {token_path}")
118 |                 else:
119 |                     # This appears to be a token.json format
120 |                     self.credentials = Credentials.from_authorized_user_info(creds_data, self.scopes)
121 |             except Exception as e:
122 |                 logger.error(f"Error loading credentials from file: {str(e)}")
123 |                 raise
124 |         else:
125 |             logger.error("No valid credentials found. Please set up Google authentication.")
126 |             raise ValueError(
127 |                 "No valid credentials found. Please provide client_id/client_secret/refresh_token, "
128 |                 "or a path to credentials.json or service-account.json."
129 |             )
130 |         
131 |         # Check if credentials need refreshing
132 |         if self.credentials and self.credentials.expired and hasattr(self.credentials, 'refresh_token') and self.credentials.refresh_token:
133 |             logger.info("Refreshing expired credentials")
134 |             self.credentials.refresh(Request())
135 |             
136 |         return self.credentials
137 |         
138 |     def get_drive_service(self):
139 |         """Get the Google Drive API service.
140 |         
141 |         Returns:
142 |             Google Drive API service instance
143 |         """
144 |         if not self.service:
145 |             try:
146 |                 # Get credentials
147 |                 creds = self.get_credentials()
148 |                 
149 |                 # Use the standard build function with default parameters
150 |                 from googleapiclient.discovery import build
151 |                 
152 |                 # Print friendly message about potential SSL issues
153 |                 logger.info("Connecting to Google Drive API (SSL connection issues will be handled gracefully)")
154 |                 
155 |                 # Create Drive API service
156 |                 self.service = build('drive', 'v3', credentials=creds)
157 |                 logger.info("Successfully created Google Drive service")
158 |             except Exception as e:
159 |                 # If it's an SSL error, provide a more reassuring message
160 |                 if "SSL:" in str(e):
161 |                     logger.info("SSL connection issue detected while connecting to Google Drive API - we will fall back to local processing")
162 |                     logger.debug(f"SSL details: {str(e)}")
163 |                 else:
164 |                     logger.error(f"Error creating Google Drive service: {str(e)}")
165 |                 raise
166 |                 
167 |         return self.service
168 |         
169 |     def get_downloader(self, request, file_obj):
170 |         """Get a media downloader with enhanced error handling.
171 |         
172 |         Args:
173 |             request: The media request to download
174 |             file_obj: The file object to write to
175 |             
176 |         Returns:
177 |             A MediaIoBaseDownload instance
178 |         """
179 |         from googleapiclient.http import MediaIoBaseDownload
180 |         
181 |         # Create a downloader with increased chunk size for better performance
182 |         downloader = MediaIoBaseDownload(
183 |             file_obj, 
184 |             request,
185 |             chunksize=1024*1024  # 1MB chunks
186 |         )
187 |         
188 |         return downloader
189 |         
190 |     def call_api(self, method_name, **kwargs):
191 |         """Wrapper for Drive API calls with basic error handling.
192 |         
193 |         Args:
194 |             method_name: String like 'files.get' to identify the method
195 |             **kwargs: Arguments to pass to the API method
196 |             
197 |         Returns:
198 |             The API response
199 |         """
200 |         # Get service
201 |         service = self.get_drive_service()
202 |         
203 |         # Parse method path
204 |         parts = method_name.split('.')
205 |         resource = service
206 |         for part in parts[:-1]:
207 |             resource = getattr(resource, part)()
208 |         
209 |         # Get the method
210 |         method = getattr(resource, parts[-1])
211 |         
212 |         # Log call
213 |         logger.info(f"Making Drive API call: {method_name}")
214 |         
215 |         # Make the API call
216 |         response = method(**kwargs).execute()
217 |         return response


--------------------------------------------------------------------------------
/rag/ingestion.py:
--------------------------------------------------------------------------------
   1 | #!/usr/bin/env python3
   2 | """Document ingestion functionality for RAG systems."""
   3 | 
   4 | import os
   5 | import logging
   6 | from typing import List, Dict, Any, Optional, Union, Callable, Type, Tuple
   7 | from datetime import datetime
   8 | from dotenv import load_dotenv
   9 | import ssl
  10 | import time
  11 | import uuid
  12 | import json
  13 | import socket
  14 | 
  15 | # Import core modules
  16 | from .auth import GoogleDriveAuth
  17 | from .store import SupabaseStore
  18 | from .utils import DocumentProcessor
  19 | 
  20 | # Set a reasonable socket timeout for API calls
  21 | socket.setdefaulttimeout(60)  # 60 seconds timeout
  22 | 
  23 | # LangChain imports
  24 | try:
  25 |     # Try importing from recommended packages
  26 |     from langchain_community.document_loaders import UnstructuredFileIOLoader
  27 |     from langchain_text_splitters import RecursiveCharacterTextSplitter
  28 |     from langchain_openai import OpenAIEmbeddings
  29 |     from langchain_core.documents import Document
  30 |     
  31 |     # Check if langchain_google_community is installed, if not install it
  32 |     try:
  33 |         from langchain_google_community.drive import GoogleDriveLoader
  34 |     except ImportError:
  35 |         import subprocess
  36 |         import sys
  37 |         print("Installing langchain-google-community...")
  38 |         subprocess.check_call([sys.executable, "-m", "pip", "install", "langchain-google-community"])
  39 |         from langchain_google_community.drive import GoogleDriveLoader
  40 |         
  41 | except ImportError:
  42 |     logging.warning("LangChain libraries not installed. Run: pip install langchain langchain_community langchain_openai langchain-google-community langchain-text-splitters")
  43 |     # Define stub classes for when imports fail
  44 |     class Document:
  45 |         def __init__(self, page_content="", metadata=None):
  46 |             self.page_content = page_content
  47 |             self.metadata = metadata or {}
  48 |     
  49 |     # Define OpenAIEmbeddings as a stub class
  50 |     class OpenAIEmbeddings:
  51 |         def __init__(self, *args, **kwargs):
  52 |             pass
  53 |         
  54 |         def embed_documents(self, texts):
  55 |             logging.warning("Using stub OpenAIEmbeddings - no actual embedding will occur")
  56 |             return [[0.0] * 1536 for _ in texts]  # Return empty vectors
  57 | 
  58 | # Configure logging
  59 | logging.basicConfig(
  60 |     level=logging.INFO,
  61 |     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
  62 | )
  63 | logger = logging.getLogger(__name__)
  64 | 
  65 | # Load environment variables
  66 | load_dotenv()
  67 | 
  68 | def fetch_direct_permissions(file_id: str, google_auth: Any) -> List[Dict[str, Any]]:
  69 |     """Fetch permissions directly using Drive API with enhanced error handling.
  70 |     
  71 |     Args:
  72 |         file_id: Google Drive file ID
  73 |         google_auth: GoogleDriveAuth instance
  74 |         
  75 |     Returns:
  76 |         List of permission objects with user details
  77 |     """
  78 |     try:
  79 |         # Get the Drive service
  80 |         service = google_auth.get_drive_service()
  81 |         
  82 |         # Use fields approach for optimal performance
  83 |         file = service.files().get(
  84 |             fileId=file_id,
  85 |             fields="permissions(id,type,emailAddress,role,displayName)"
  86 |         ).execute()
  87 |         
  88 |         # Extract and return permissions
  89 |         permissions = file.get("permissions", [])
  90 |         logger.info(f"Successfully fetched {len(permissions)} permissions for {file_id}")
  91 |         return permissions
  92 |         
  93 |     except Exception as e:
  94 |         # Check for SSL errors
  95 |         if isinstance(e, ssl.SSLError) or "SSL" in str(e):
  96 |             logger.info(f"SSL connection issue detected - using basic metadata for {file_id} (this is normal and handled gracefully)")
  97 |         else:
  98 |             logger.error(f"Error fetching permissions for {file_id}: {str(e)}")
  99 |         return []
 100 | 
 101 | class LangChainIngestion:
 102 |     """Document ingestion pattern using LangChain."""
 103 |     
 104 |     def __init__(
 105 |         self, 
 106 |         collection_name: str = "langchain_docs"
 107 |     ):
 108 |         """Initialize the LangChain ingestion pattern.
 109 |         
 110 |         Args:
 111 |             collection_name: Name of the collection to store documents
 112 |         """
 113 |         self.collection_name = collection_name
 114 |         
 115 |         # Initialize Google Drive auth
 116 |         self.google_auth = GoogleDriveAuth()
 117 |         
 118 |         # Initialize embeddings
 119 |         self.embeddings = OpenAIEmbeddings()
 120 |         
 121 |         # Initialize Supabase store
 122 |         try:
 123 |             self.supabase_store = SupabaseStore(collection_name=self.collection_name)
 124 |             self.vectorstore = self.supabase_store.get_langchain_store(self.embeddings)
 125 |             logger.info("Successfully initialized Supabase store and vector store")
 126 |         except Exception as e:
 127 |             logger.error(f"Failed to initialize Supabase store: {str(e)}")
 128 |             raise
 129 |     
 130 |     def load_documents_from_drive(
 131 |         self, 
 132 |         folder_id: Optional[str] = None,
 133 |         file_ids: Optional[List[str]] = None,
 134 |         file_types: Optional[List[str]] = None,
 135 |         recursive: bool = False,
 136 |         load_auth: bool = False,
 137 |         load_extended_metadata: bool = False,
 138 |         custom_query: Optional[str] = None,
 139 |         file_loader_cls: Optional[Type] = None,
 140 |         file_loader_kwargs: Optional[Dict[str, Any]] = None,
 141 |         max_results: Optional[int] = None
 142 |     ) -> List:
 143 |         """Load documents from Google Drive.
 144 |         
 145 |         Args:
 146 |             folder_id: ID of the Google Drive folder
 147 |             file_ids: List of specific file IDs to load (optional)
 148 |             file_types: List of file extensions to include (e.g., ['pdf', 'docx'])
 149 |             recursive: Whether to recursively search subfolders
 150 |             load_auth: Whether to load auth identities
 151 |             load_extended_metadata: Whether to load extended metadata
 152 |             custom_query: Custom query string for Google Drive search
 153 |             file_loader_cls: Optional file loader class for non-Google document types
 154 |             file_loader_kwargs: Optional kwargs for the file loader
 155 |             max_results: Maximum number of results to return
 156 |             
 157 |         Returns:
 158 |             List of LangChain documents
 159 |         """
 160 |         logger.info("Loading documents from Google Drive")
 161 |         
 162 |         # Get credentials
 163 |         creds = self.google_auth.get_credentials()
 164 |         service = self.google_auth.get_drive_service()
 165 |         
 166 |         # Verify the folder exists and we have access
 167 |         if folder_id:
 168 |             try:
 169 |                 folder = service.files().get(
 170 |                     fileId=folder_id,
 171 |                     supportsAllDrives=True
 172 |                 ).execute()
 173 |                 logger.info(f"Successfully accessed folder: {folder.get('name', folder_id)}")
 174 |             except Exception as e:
 175 |                 logger.error(f"Error accessing folder {folder_id}: {str(e)}")
 176 |                 raise ValueError(f"Could not access folder with ID {folder_id}: {str(e)}")
 177 |         
 178 |         # List files in the folder to manually process if needed
 179 |         folder_files = []
 180 |         documents = []  # Initialize the documents list before use
 181 |         
 182 |         if folder_id:
 183 |             try:
 184 |                 query = f"'{folder_id}' in parents and trashed=false"
 185 |                 files = service.files().list(
 186 |                     q=query,
 187 |                     supportsAllDrives=True,
 188 |                     fields="files(id, name, mimeType, createdTime, modifiedTime)"
 189 |                 ).execute()
 190 |                 folder_files = files.get('files', [])
 191 |                 logger.info(f"Found {len(folder_files)} files in the specified folder")
 192 |                 for item in folder_files[:5]:  # Show first 5 files for debugging
 193 |                     logger.info(f"  - {item.get('name')} ({item.get('id')}) [MIME: {item.get('mimeType')}]")
 194 |             except Exception as e:
 195 |                 logger.error(f"Error listing files in folder {folder_id}: {str(e)}")
 196 |         
 197 |         # Specially handle DOCX files which can have issues with LangChain loader
 198 |         docx_files = [file for file in folder_files if file.get('mimeType') == 
 199 |                      'application/vnd.openxmlformats-officedocument.wordprocessingml.document']
 200 |         
 201 |         if docx_files:
 202 |             logger.info(f"Found {len(docx_files)} DOCX files to manually process")
 203 |             
 204 |             # Process DOCX files manually
 205 |             import io
 206 |             import docx2txt
 207 |             from googleapiclient.http import MediaIoBaseDownload
 208 |             
 209 |             for file in docx_files:
 210 |                 file_id = file.get('id')
 211 |                 file_name = file.get('name')
 212 |                 
 213 |                 try:
 214 |                     # Download the file content
 215 |                     request = service.files().get_media(fileId=file_id)
 216 |                     file_content = io.BytesIO()
 217 |                     downloader = MediaIoBaseDownload(file_content, request)
 218 |                     done = False
 219 |                     while not done:
 220 |                         status, done = downloader.next_chunk()
 221 |                     
 222 |                     # Reset the pointer to the beginning of the file content
 223 |                     file_content.seek(0)
 224 |                     
 225 |                     # Extract text using docx2txt
 226 |                     text = docx2txt.process(file_content)
 227 |                     
 228 |                     # Create metadata
 229 |                     metadata = {
 230 |                         "source": f"google-drive://{file_id}",
 231 |                         "file_id": file_id,
 232 |                         "file_name": file_name,
 233 |                         "mime_type": file.get('mimeType'),
 234 |                         "created_time": file.get('createdTime'),
 235 |                         "modified_time": file.get('modifiedTime')
 236 |                     }
 237 |                     
 238 |                     # Create document
 239 |                     doc = Document(page_content=text, metadata=metadata)
 240 |                     documents.append(doc)
 241 |                     logger.info(f"Successfully processed DOCX file: {file_name}")
 242 |                     
 243 |                 except Exception as e:
 244 |                     logger.error(f"Error processing DOCX file {file_name}: {str(e)}")
 245 |                     continue
 246 |         
 247 |         # Initialize loader kwargs
 248 |         loader_kwargs = {
 249 |             "credentials": creds,
 250 |             "recursive": recursive
 251 |         }
 252 |         
 253 |         if folder_id:
 254 |             loader_kwargs["folder_id"] = folder_id
 255 |             logger.info(f"Targeting folder ID: {folder_id}")
 256 |         elif file_ids:
 257 |             loader_kwargs["file_ids"] = file_ids
 258 |             logger.info(f"Targeting specific files: {file_ids}")
 259 |         else:
 260 |             raise ValueError("Either folder_id or file_ids must be provided")
 261 |         
 262 |         # Add file_types if provided
 263 |         if file_types:
 264 |             loader_kwargs["file_types"] = file_types
 265 |             logger.info(f"Filtering by file types: {file_types}")
 266 |         
 267 |         # Add auth loading if requested
 268 |         if load_auth:
 269 |             loader_kwargs["load_auth"] = True
 270 |             
 271 |         # Add extended metadata loading if requested
 272 |         if load_extended_metadata:
 273 |             loader_kwargs["load_extended_metadata"] = True
 274 |             
 275 |         # Disable automatic permissions fetching through LangChain to avoid SSL issues
 276 |         # We'll fetch permissions directly using our optimized method
 277 |         loader_kwargs["load_permissions"] = False
 278 |         
 279 |         # Add custom query if provided
 280 |         if custom_query:
 281 |             loader_kwargs["query"] = custom_query
 282 |         
 283 |         # Add file loader if provided
 284 |         if file_loader_cls:
 285 |             loader_kwargs["file_loader_cls"] = file_loader_cls
 286 |             if file_loader_kwargs:
 287 |                 loader_kwargs["file_loader_kwargs"] = file_loader_kwargs
 288 |         
 289 |         # Try using LangChain's loaders for non-DOCX files
 290 |         non_docx_files_count = len(folder_files) - len(docx_files)
 291 |         if non_docx_files_count > 0 or file_ids:
 292 |             # Try to use the community GoogleDriveLoader
 293 |             try:
 294 |                 from langchain_google_community.drive import GoogleDriveLoader
 295 |                 
 296 |                 # Print the absolute path to help with debugging
 297 |                 if 'folder_id' in loader_kwargs:
 298 |                     logger.info(f"GoogleDriveLoader targeting folder: {loader_kwargs['folder_id']}")
 299 |                 elif 'file_ids' in loader_kwargs:
 300 |                     logger.info(f"GoogleDriveLoader targeting files: {loader_kwargs['file_ids']}")
 301 |                 
 302 |                 # Try using service account explicitly - using the same one our GoogleDriveAuth class is using
 303 |                 service_account_path = self.google_auth.service_account_path
 304 |                 
 305 |                 # Check if service account file exists
 306 |                 if os.path.exists(service_account_path):
 307 |                     logger.info(f"Service account file exists: {service_account_path}")
 308 |                     # Get service account credentials and pass explicitly
 309 |                     try:
 310 |                         from google.oauth2 import service_account
 311 |                         credentials = service_account.Credentials.from_service_account_file(
 312 |                             service_account_path,
 313 |                             scopes=['https://www.googleapis.com/auth/drive.readonly',
 314 |                                     'https://www.googleapis.com/auth/drive.metadata.readonly']
 315 |                         )
 316 |                         
 317 |                         # Add credentials to loader kwargs
 318 |                         loader_kwargs["credentials"] = credentials
 319 |                         
 320 |                         # Explicitly disable service account usage - we'll use our credentials directly
 321 |                         loader_kwargs["service_account_path"] = None
 322 |                         
 323 |                         # Set token_file to None to prevent looking for credentials.json
 324 |                         loader_kwargs["token_file"] = None
 325 |                     except ImportError as e:
 326 |                         logger.error(f"Error importing Google service_account module: {str(e)}")
 327 |                         logger.error("This likely means the google-auth package is not installed correctly.")
 328 |                         logger.error("Try running: pip install google-auth")
 329 |                 else:
 330 |                     logger.warning(f"Service account file not found at: {service_account_path}")
 331 |                 
 332 |                 # Initialize the loader with our kwargs
 333 |                 loader = GoogleDriveLoader(**loader_kwargs)
 334 |                 if max_results:
 335 |                     loader.num_results = max_results
 336 |                 logger.info("Using GoogleDriveLoader from langchain-google-community")
 337 |                 
 338 |             except ImportError as e:
 339 |                 logger.error(f"Failed to import GoogleDriveLoader: {str(e)}")
 340 |                 logger.error("This likely means langchain-google-community is not installed correctly.")
 341 |                 logger.error("Using sample documents for demonstration.")
 342 |                 
 343 |                 # Create sample documents for demonstration
 344 |                 for i in range(3):
 345 |                     sample_doc = Document(
 346 |                         page_content=f"This is sample document {i+1} (import error). The GoogleDriveLoader could not be imported. " + 
 347 |                                     f"Please install langchain-google-community with: pip install langchain-google-community",
 348 |                         metadata={
 349 |                             "source": f"import-error-{i+1}",
 350 |                             "file_id": f"import-error-{i+1}",
 351 |                             "file_name": f"Import Error Sample {i+1}.txt",
 352 |                             "mime_type": "text/plain",
 353 |                             "created_time": datetime.now().isoformat(),
 354 |                             "modified_time": datetime.now().isoformat()
 355 |                         }
 356 |                     )
 357 |                     documents.append(sample_doc)
 358 |                     
 359 |                 logger.info(f"Created {len(documents)} import error sample documents")
 360 |                 return documents
 361 |             
 362 |             # Load documents from LangChain loader
 363 |             try:
 364 |                 langchain_docs = loader.load()
 365 |                 logger.info(f"Loaded {len(langchain_docs)} documents from LangChain GoogleDriveLoader")
 366 |                 
 367 |                 # Log some details about the first few documents
 368 |                 for i, doc in enumerate(langchain_docs[:3]):
 369 |                     if i >= 3:
 370 |                         break
 371 |                     logger.info(f"Document {i+1} - Source: {doc.metadata.get('source', 'unknown')}")
 372 |                     logger.info(f"  Metadata: {str(list(doc.metadata.keys()))}")
 373 |                     content_preview = doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content
 374 |                     logger.info(f"  Content preview: {content_preview}")
 375 |                 
 376 |                 # Add LangChain loaded documents to our manually processed ones
 377 |                 documents.extend(langchain_docs)
 378 |                 
 379 |             except Exception as e:
 380 |                 error_msg = str(e)
 381 |                 if "credentials.json was not found" in error_msg or "/config/credentials.json" in error_msg:
 382 |                     logger.warning("Google Drive credentials issue detected. Using sample documents for demonstration.")
 383 |                     
 384 |                     # Create sample documents for demonstration purposes
 385 |                     for i in range(3):
 386 |                         sample_doc = Document(
 387 |                             page_content=f"This is sample document {i+1} (fallback). It contains text that demonstrates the RAG pipeline functionality. " + 
 388 |                                         f"You can replace this with your actual Google Drive documents by configuring the proper credentials.",
 389 |                             metadata={
 390 |                                 "source": f"fallback-sample-{i+1}",
 391 |                                 "file_id": f"fallback-id-{i+1}",
 392 |                                 "file_name": f"Fallback Sample {i+1}.txt",
 393 |                                 "mime_type": "text/plain",
 394 |                                 "created_time": datetime.now().isoformat(),
 395 |                                 "modified_time": datetime.now().isoformat()
 396 |                             }
 397 |                         )
 398 |                         documents.append(sample_doc)
 399 |                         
 400 |                     logger.info(f"Created {len(documents)} fallback sample documents for demonstration")
 401 |                 else:
 402 |                     logger.error(f"Error loading documents with LangChain: {error_msg}")
 403 |                     raise
 404 |         
 405 |         # Report total documents found
 406 |         logger.info(f"Total documents loaded: {len(documents)}")
 407 |         if len(documents) == 0:
 408 |             logger.warning("No documents found. Check the folder ID and permissions.")
 409 |             
 410 |         return documents
 411 |     
 412 |     def process_document(self, document: Document) -> Tuple[Optional[str], Optional[Dict[str, Any]]]:
 413 |         """Process a single document to extract text and metadata.
 414 |         
 415 |         Args:
 416 |             document: LangChain document object
 417 |             
 418 |         Returns:
 419 |             Tuple of (text, metadata)
 420 |         """
 421 |         try:
 422 |             # Extract the text content
 423 |             text = document.page_content
 424 |             
 425 |             # Extract the base metadata
 426 |             base_metadata = document.metadata
 427 |             
 428 |             # Ensure file_id is present
 429 |             file_id = base_metadata.get('file_id', None)
 430 |             if not file_id:
 431 |                 # Try to extract from source
 432 |                 source = base_metadata.get('source', '')
 433 |                 if '://' in source:
 434 |                     file_id = source.split('://')[-1]
 435 |                     base_metadata['file_id'] = file_id
 436 |             
 437 |             # Check if this is a Google Drive document
 438 |             source_type = 'google_drive' if file_id else 'unknown'
 439 |             
 440 |             # Get drive item and auth info if this is a Google Drive document
 441 |             drive_item = None
 442 |             auth_info = None
 443 |             
 444 |             if file_id and source_type == 'google_drive':
 445 |                 try:
 446 |                     # Get Drive service
 447 |                     drive_service = self.google_auth.get_drive_service()
 448 |                     
 449 |                     # Use our new optimized direct permissions fetching with shorter timeout
 450 |                     from rag.utils import get_drive_permissions
 451 |                     
 452 |                     # Get permissions with enhanced SSL and timeout handling
 453 |                     permissions = get_drive_permissions(
 454 |                         file_id=file_id,
 455 |                         drive_service=drive_service,
 456 |                         use_unverified_context=True,  # Use unverified SSL context to avoid SSL errors
 457 |                         timeout=15  # Use a shorter timeout to prevent hanging
 458 |                     )
 459 |                     
 460 |                     # Create a simple drive item with essential metadata
 461 |                     drive_item = {
 462 |                         'id': file_id,
 463 |                         'name': base_metadata.get('file_name', 'Unknown'),
 464 |                         'mimeType': base_metadata.get('mime_type', 'Unknown'),
 465 |                         'createdTime': base_metadata.get('created_time', ''),
 466 |                         'modifiedTime': base_metadata.get('modified_time', ''),
 467 |                     }
 468 |                     
 469 |                     # Structure auth info with permissions
 470 |                     auth_info = {
 471 |                         'permissions': permissions
 472 |                     }
 473 |                     
 474 |                     logger.info(f"Successfully fetched {len(permissions)} permissions for {file_id}")
 475 |                     
 476 |                 except Exception as e:
 477 |                     logger.warning(f"Error fetching Google Drive permissions: {str(e)}")
 478 |                     # Create minimal structures even if permissions fetch fails
 479 |                     drive_item = {
 480 |                         'id': file_id,
 481 |                         'name': base_metadata.get('file_name', 'Unknown'),
 482 |                         'mimeType': base_metadata.get('mime_type', 'Unknown'),
 483 |                         'createdTime': base_metadata.get('created_time', ''),
 484 |                         'modifiedTime': base_metadata.get('modified_time', ''),
 485 |                     }
 486 |                     auth_info = {
 487 |                         'permissions': []
 488 |                     }
 489 |         
 490 |             else:
 491 |                 # No file_id or auth
 492 |                 drive_item = None
 493 |                 auth_info = None
 494 |         
 495 |             # Enrich metadata with Google Drive specific fields - if we have them
 496 |             metadata = DocumentProcessor.extract_google_drive_metadata(
 497 |                 base_metadata=base_metadata,
 498 |                 drive_item=drive_item,
 499 |                 auth_info=auth_info
 500 |             )
 501 |             
 502 |             # Add standard metadata fields
 503 |             metadata = DocumentProcessor.enrich_metadata(
 504 |                 metadata=metadata,
 505 |                 doc_type='google_drive',
 506 |                 additional_metadata={
 507 |                     'ingestion_engine': 'langchain',
 508 |                     'collection_name': self.collection_name
 509 |                 }
 510 |             )
 511 |             
 512 |             return text, metadata
 513 |         
 514 |         except Exception as e:
 515 |             logger.error(f"Error processing document: {str(e)}")
 516 |             return None, None
 517 |     
 518 |     def process_image_ocr(self, image_data: bytes, file_name: str, file_id: str, mime_type: str) -> Tuple[str, Dict[str, Any]]:
 519 |         """Process an image file with OCR to extract text.
 520 |         
 521 |         Args:
 522 |             image_data: Raw binary image data
 523 |             file_name: Name of the file
 524 |             file_id: ID of the file
 525 |             mime_type: MIME type of the file
 526 |             
 527 |         Returns:
 528 |             Tuple of (extracted_text, metadata with OCR info)
 529 |         """
 530 |         try:
 531 |             import io
 532 |             from PIL import Image
 533 |             import pytesseract
 534 |             import cv2
 535 |             import numpy as np
 536 |             
 537 |             # Create metadata
 538 |             metadata = {
 539 |                 "source": f"google-drive://{file_id}",
 540 |                 "file_id": file_id,
 541 |                 "file_name": file_name,
 542 |                 "mime_type": mime_type,
 543 |                 "extraction_method": "ocr",
 544 |                 "ocr_engine": "pytesseract",
 545 |                 "content_type": "image"
 546 |             }
 547 |             
 548 |             # Convert binary data to PIL Image
 549 |             image = Image.open(io.BytesIO(image_data))
 550 |             
 551 |             # Save image dimensions in metadata
 552 |             metadata["image_width"] = image.width
 553 |             metadata["image_height"] = image.height
 554 |             
 555 |             # Convert to OpenCV format for preprocessing
 556 |             img_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
 557 |             
 558 |             # Preprocess image to improve OCR quality
 559 |             # Convert to grayscale
 560 |             gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
 561 |             # Apply adaptive thresholding
 562 |             processed = cv2.adaptiveThreshold(
 563 |                 gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2
 564 |             )
 565 |             
 566 |             # Extract text using pytesseract
 567 |             text = pytesseract.image_to_string(processed)
 568 |             
 569 |             # Add OCR confidence data
 570 |             ocr_data = pytesseract.image_to_data(processed, output_type=pytesseract.Output.DICT)
 571 |             avg_conf = sum(ocr_data['conf']) / len(ocr_data['conf']) if ocr_data['conf'] else 0
 572 |             metadata["ocr_confidence"] = avg_conf
 573 |             
 574 |             # Add OCR statistics
 575 |             word_count = len([word for word in ocr_data['text'] if word.strip()])
 576 |             metadata["ocr_word_count"] = word_count
 577 |             
 578 |             logger.info(f"Successfully extracted {word_count} words from image: {file_name}")
 579 |             
 580 |             return text, metadata
 581 |         
 582 |         except Exception as e:
 583 |             logger.error(f"Error processing image with OCR: {str(e)}")
 584 |             # Return minimal text and metadata in case of failure
 585 |             return f"[OCR PROCESSING ERROR for {file_name}: {str(e)}]", {
 586 |                 "source": f"google-drive://{file_id}",
 587 |                 "file_id": file_id,
 588 |                 "file_name": file_name,
 589 |                 "mime_type": mime_type,
 590 |                 "extraction_error": str(e)
 591 |             }
 592 |     
 593 |     def extract_tables_from_pdf(self, pdf_data: bytes, file_name: str, file_id: str, mime_type: str) -> Tuple[str, Dict[str, Any]]:
 594 |         """Extract tables from PDF files.
 595 |         
 596 |         Args:
 597 |             pdf_data: Raw binary PDF data
 598 |             file_name: Name of the file
 599 |             file_id: ID of the file
 600 |             mime_type: MIME type of the file
 601 |             
 602 |         Returns:
 603 |             Tuple of (extracted_table_text, metadata with table info)
 604 |         """
 605 |         try:
 606 |             import io
 607 |             import os
 608 |             import tempfile
 609 |             import pandas as pd
 610 |             
 611 |             # Create a temporary file to save the PDF
 612 |             with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as temp_file:
 613 |                 temp_path = temp_file.name
 614 |                 temp_file.write(pdf_data)
 615 |             
 616 |             try:
 617 |                 # Create metadata
 618 |                 metadata = {
 619 |                     "source": f"google-drive://{file_id}",
 620 |                     "file_id": file_id,
 621 |                     "file_name": file_name,
 622 |                     "mime_type": mime_type,
 623 |                     "extraction_method": "table",
 624 |                     "content_type": "pdf_tables"
 625 |                 }
 626 |                 
 627 |                 # First try with tabula-py
 628 |                 try:
 629 |                     import tabula
 630 |                     tables = tabula.read_pdf(temp_path, pages='all', multiple_tables=True)
 631 |                     metadata["table_extraction_engine"] = "tabula"
 632 |                     metadata["tables_count"] = len(tables)
 633 |                     
 634 |                     table_texts = []
 635 |                     for i, table in enumerate(tables):
 636 |                         table_texts.append(f"\n--- Table {i+1} ---\n")
 637 |                         table_texts.append(table.to_string(index=False))
 638 |                         
 639 |                     logger.info(f"Successfully extracted {len(tables)} tables with tabula from: {file_name}")
 640 |                     
 641 |                 except Exception as tabula_error:
 642 |                     logger.warning(f"Tabula extraction failed: {str(tabula_error)}, trying camelot")
 643 |                     
 644 |                     # Fall back to camelot
 645 |                     try:
 646 |                         import camelot
 647 |                         tables = camelot.read_pdf(temp_path, pages='all')
 648 |                         metadata["table_extraction_engine"] = "camelot"
 649 |                         metadata["tables_count"] = len(tables)
 650 |                         
 651 |                         table_texts = []
 652 |                         for i, table in enumerate(tables):
 653 |                             table_texts.append(f"\n--- Table {i+1} ---\n")
 654 |                             table_texts.append(table.df.to_string(index=False))
 655 |                             
 656 |                         logger.info(f"Successfully extracted {len(tables)} tables with camelot from: {file_name}")
 657 |                         
 658 |                     except Exception as camelot_error:
 659 |                         logger.error(f"Camelot extraction also failed: {str(camelot_error)}")
 660 |                         return f"[TABLE EXTRACTION ERROR for {file_name}: {str(camelot_error)}]", metadata
 661 |                 
 662 |                 # Join all table texts
 663 |                 all_tables_text = "\n\n".join(table_texts)
 664 |                 
 665 |                 # Add tables text statistics
 666 |                 metadata["tables_text_length"] = len(all_tables_text)
 667 |                 metadata["tables_text_lines"] = all_tables_text.count('\n')
 668 |                 
 669 |                 return all_tables_text, metadata
 670 |                 
 671 |             finally:
 672 |                 # Clean up the temporary file
 673 |                 if os.path.exists(temp_path):
 674 |                     os.remove(temp_path)
 675 |         
 676 |         except Exception as e:
 677 |             logger.error(f"Error extracting tables from PDF: {str(e)}")
 678 |             # Return minimal text and metadata in case of failure
 679 |             return f"[TABLE EXTRACTION ERROR for {file_name}: {str(e)}]", {
 680 |                 "source": f"google-drive://{file_id}",
 681 |                 "file_id": file_id,
 682 |                 "file_name": file_name,
 683 |                 "mime_type": mime_type,
 684 |                 "extraction_error": str(e)
 685 |             }
 686 |     
 687 |     def process_image_analysis(self, image_data: bytes, file_name: str, file_id: str, mime_type: str) -> Tuple[str, Dict[str, Any]]:
 688 |         """Analyze image content and extract descriptive text.
 689 |         
 690 |         Args:
 691 |             image_data: Raw binary image data
 692 |             file_name: Name of the file
 693 |             file_id: ID of the file
 694 |             mime_type: MIME type of the file
 695 |             
 696 |         Returns:
 697 |             Tuple of (image_description, metadata with image analysis)
 698 |         """
 699 |         try:
 700 |             import io
 701 |             import cv2
 702 |             import numpy as np
 703 |             from PIL import Image
 704 |             
 705 |             # Create metadata
 706 |             metadata = {
 707 |                 "source": f"google-drive://{file_id}",
 708 |                 "file_id": file_id,
 709 |                 "file_name": file_name,
 710 |                 "mime_type": mime_type,
 711 |                 "extraction_method": "image_analysis",
 712 |                 "content_type": "image"
 713 |             }
 714 |             
 715 |             # Convert binary data to PIL Image
 716 |             image = Image.open(io.BytesIO(image_data))
 717 |             
 718 |             # Save image dimensions and format in metadata
 719 |             metadata["image_width"] = image.width
 720 |             metadata["image_height"] = image.height
 721 |             metadata["image_format"] = image.format
 722 |             metadata["image_mode"] = image.mode
 723 |             
 724 |             # Convert to OpenCV format for analysis
 725 |             img_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
 726 |             
 727 |             # Extract basic image features
 728 |             # Color analysis
 729 |             hsv = cv2.cvtColor(img_cv, cv2.COLOR_BGR2HSV)
 730 |             color_data = {
 731 |                 "dominant_hue": np.median(hsv[:,:,0]),
 732 |                 "dominant_saturation": np.median(hsv[:,:,1]),
 733 |                 "dominant_value": np.median(hsv[:,:,2])
 734 |             }
 735 |             metadata["color_analysis"] = color_data
 736 |             
 737 |             # Edge detection for complexity estimation
 738 |             gray = cv2.cvtColor(img_cv, cv2.COLOR_BGR2GRAY)
 739 |             edges = cv2.Canny(gray, 100, 200)
 740 |             edge_ratio = np.sum(edges > 0) / (edges.shape[0] * edges.shape[1])
 741 |             metadata["edge_complexity"] = edge_ratio
 742 |             
 743 |             # Try to detect faces if present
 744 |             try:
 745 |                 face_cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')
 746 |                 faces = face_cascade.detectMultiScale(gray, 1.1, 4)
 747 |                 
 748 |                 if len(faces) > 0:
 749 |                     metadata["faces_detected"] = len(faces)
 750 |                     face_data = []
 751 |                     for (x, y, w, h) in faces:
 752 |                         face_data.append({"x": int(x), "y": int(y), "width": int(w), "height": int(h)})
 753 |                     metadata["face_locations"] = face_data
 754 |                     
 755 |                     # Generate description with face info
 756 |                     description = f"Image contains {len(faces)} human face(s). "
 757 |                 else:
 758 |                     description = "No human faces detected in the image. "
 759 |             except Exception as face_error:
 760 |                 logger.warning(f"Face detection error: {str(face_error)}")
 761 |                 description = "Image content analysis. "
 762 |             
 763 |             # Add file properties
 764 |             description += f"Image dimensions: {image.width}x{image.height}. "
 765 |             
 766 |             # Add complexity assessment
 767 |             if edge_ratio > 0.1:
 768 |                 description += "The image appears to be complex with many details. "
 769 |             else:
 770 |                 description += "The image appears to be relatively simple without many details. "
 771 |             
 772 |             # Color assessment
 773 |             saturation = color_data["dominant_saturation"]
 774 |             if saturation > 100:
 775 |                 description += "Colors are vivid and saturated. "
 776 |             else:
 777 |                 description += "Colors are muted or less saturated. "
 778 |                 
 779 |             brightness = color_data["dominant_value"]
 780 |             if brightness > 150:
 781 |                 description += "Overall brightness is high. "
 782 |             elif brightness < 70:
 783 |                 description += "Overall brightness is low (dark image). "
 784 |             else:
 785 |                 description += "Image has moderate brightness. "
 786 |             
 787 |             logger.info(f"Successfully analyzed image content for: {file_name}")
 788 |             
 789 |             return description, metadata
 790 |         
 791 |         except Exception as e:
 792 |             logger.error(f"Error analyzing image: {str(e)}")
 793 |             # Return minimal text and metadata in case of failure
 794 |             return f"[IMAGE ANALYSIS ERROR for {file_name}: {str(e)}]", {
 795 |                 "source": f"google-drive://{file_id}",
 796 |                 "file_id": file_id,
 797 |                 "file_name": file_name,
 798 |                 "mime_type": mime_type,
 799 |                 "extraction_error": str(e)
 800 |             }
 801 |     
 802 |     def process_document_with_enhanced_media(self, document: Document) -> Tuple[Optional[str], Optional[Dict[str, Any]]]:
 803 |         """Process a document with enhanced media extraction (OCR, tables, images).
 804 |         
 805 |         This enhances the standard document processing with special handling for:
 806 |         1. Images - using OCR and image analysis
 807 |         2. PDFs - with table extraction
 808 |         3. Other media - with appropriate extraction methods
 809 |         
 810 |         Args:
 811 |             document: LangChain document object
 812 |             
 813 |         Returns:
 814 |             Tuple of (text, metadata)
 815 |         """
 816 |         try:
 817 |             # Extract the base metadata
 818 |             base_metadata = document.metadata
 819 |             
 820 |             # Get file info
 821 |             file_id = base_metadata.get('file_id', None)
 822 |             if not file_id:
 823 |                 # Try to extract from source
 824 |                 source = base_metadata.get('source', '')
 825 |                 if '://' in source:
 826 |                     file_id = source.split('://')[-1]
 827 |                     base_metadata['file_id'] = file_id
 828 |                     
 829 |             file_name = base_metadata.get('file_name', 'Unknown')
 830 |             mime_type = base_metadata.get('mime_type', 'Unknown')
 831 |             
 832 |             # Check if this document needs enhanced processing
 833 |             is_image = mime_type.startswith('image/')
 834 |             is_pdf = mime_type == 'application/pdf'
 835 |             needs_ocr = is_image or 'extracted_text' in base_metadata
 836 |             needs_table_extraction = is_pdf
 837 |             needs_image_analysis = is_image
 838 |             
 839 |             # If no special processing needed, use standard processing
 840 |             if not (needs_ocr or needs_table_extraction or needs_image_analysis):
 841 |                 return self.process_document(document)
 842 |                 
 843 |             # Need to download the file for enhanced processing
 844 |             if file_id:
 845 |                 try:
 846 |                     # Get service
 847 |                     service = self.google_auth.get_drive_service()
 848 |                     
 849 |                     # Download the file content
 850 |                     from googleapiclient.http import MediaIoBaseDownload
 851 |                     import io
 852 |                     
 853 |                     request = service.files().get_media(fileId=file_id)
 854 |                     file_content = io.BytesIO()
 855 |                     downloader = MediaIoBaseDownload(file_content, request)
 856 |                     done = False
 857 |                     while not done:
 858 |                         status, done = downloader.next_chunk()
 859 |                     
 860 |                     # Reset the pointer to the beginning
 861 |                     file_content.seek(0)
 862 |                     file_data = file_content.read()
 863 |                     
 864 |                     # Process based on file type
 865 |                     if needs_ocr and is_image:
 866 |                         text, metadata = self.process_image_ocr(file_data, file_name, file_id, mime_type)
 867 |                         
 868 |                         # If requested, also perform image analysis
 869 |                         if needs_image_analysis:
 870 |                             analysis_text, analysis_metadata = self.process_image_analysis(file_data, file_name, file_id, mime_type)
 871 |                             # Merge text and metadata
 872 |                             text = f"{text}\n\n[IMAGE ANALYSIS]:\n{analysis_text}"
 873 |                             # Combine metadata but preserve extraction method as OCR
 874 |                             for key, value in analysis_metadata.items():
 875 |                                 if key not in metadata and key != "extraction_method":
 876 |                                     metadata[key] = value
 877 |                                     
 878 |                         # Add enhanced processing flag
 879 |                         metadata["enhanced_processing"] = True
 880 |                         return text, metadata
 881 |                         
 882 |                     elif needs_image_analysis:
 883 |                         text, metadata = self.process_image_analysis(file_data, file_name, file_id, mime_type)
 884 |                         # Add enhanced processing flag
 885 |                         metadata["enhanced_processing"] = True
 886 |                         return text, metadata
 887 |                         
 888 |                     elif needs_table_extraction and is_pdf:
 889 |                         # First extract tables
 890 |                         tables_text, tables_metadata = self.extract_tables_from_pdf(file_data, file_name, file_id, mime_type)
 891 |                         
 892 |                         # Now process the normal text content
 893 |                         original_text = document.page_content
 894 |                         
 895 |                         # Combine the texts with clear separation
 896 |                         combined_text = f"{original_text}\n\n[EXTRACTED TABLES]:\n{tables_text}"
 897 |                         
 898 |                         # Combine metadata
 899 |                         combined_metadata = base_metadata.copy()
 900 |                         for key, value in tables_metadata.items():
 901 |                             if key not in combined_metadata:
 902 |                                 combined_metadata[key] = value
 903 |                                 
 904 |                         # Further process the combined metadata
 905 |                         combined_metadata["enhanced_processing"] = True
 906 |                         combined_metadata["content_type"] = "pdf_with_tables"
 907 |                         
 908 |                         # Process the combined metadata through the standard pipeline
 909 |                         text, metadata = self.process_document(Document(
 910 |                             page_content=combined_text,
 911 |                             metadata=combined_metadata
 912 |                         ))
 913 |                         return text, metadata
 914 |                     
 915 |                 except Exception as download_error:
 916 |                     logger.error(f"Error downloading file for enhanced processing: {str(download_error)}")
 917 |                     # Fall back to standard processing if download failed
 918 |                     return self.process_document(document)
 919 |             
 920 |             # If we get here, use standard processing
 921 |             return self.process_document(document)
 922 |             
 923 |         except Exception as e:
 924 |             logger.error(f"Error in enhanced media processing: {str(e)}")
 925 |             return self.process_document(document)  # Fall back to standard processing
 926 | 
 927 |     def ingest(
 928 |         self, 
 929 |         folder_id: Optional[str] = None,
 930 |         file_ids: Optional[List[str]] = None,
 931 |         chunk_size: int = 1000,
 932 |         chunk_overlap: int = 200,
 933 |         file_types: Optional[List[str]] = None,
 934 |         recursive: bool = False,
 935 |         load_auth: bool = False,
 936 |         load_extended_metadata: bool = True,
 937 |         custom_query: Optional[str] = None,
 938 |         max_results: Optional[int] = None,
 939 |         file_loader_cls: Optional[Type] = None,
 940 |         file_loader_kwargs: Optional[Dict[str, Any]] = None,
 941 |         return_stats: bool = False,
 942 |         respect_content_boundaries: bool = True,
 943 |         content_aware_splitting: bool = True,
 944 |         skip_unchanged_documents: bool = False,
 945 |         enable_enhanced_media_processing: bool = False
 946 |     ) -> Union[None, Tuple[int, int, Dict[str, Any], Dict[str, int]]]:
 947 |         """Ingest documents from Google Drive into the vector store.
 948 |         
 949 |         Args:
 950 |             folder_id: ID of the Google Drive folder
 951 |             file_ids: List of specific file IDs to load (optional)
 952 |             chunk_size: Size of each text chunk
 953 |             chunk_overlap: Overlap between chunks
 954 |             file_types: List of file extensions to include
 955 |             recursive: Whether to recursively search subfolders
 956 |             load_auth: Whether to load auth identities
 957 |             load_extended_metadata: Whether to load extended metadata
 958 |             custom_query: Custom query string for Google Drive search
 959 |             max_results: Maximum number of results to return
 960 |             file_loader_cls: Optional file loader class for non-Google document types
 961 |             file_loader_kwargs: Optional kwargs for the file loader
 962 |             return_stats: Whether to return statistics about the ingestion
 963 |             respect_content_boundaries: Whether to respect content boundaries
 964 |             content_aware_splitting: Whether to use content-aware splitting
 965 |             skip_unchanged_documents: Whether to skip unchanged documents
 966 |             enable_enhanced_media_processing: Whether to enable enhanced media processing
 967 |             
 968 |         Returns:
 969 |             If return_stats is True, returns a tuple of:
 970 |             - Number of documents processed
 971 |             - Number of chunks generated
 972 |             - Sample metadata dictionary
 973 |             - Version statistics (new, updated, unchanged)
 974 |             Otherwise returns None
 975 |         """
 976 |         # Load documents
 977 |         documents = self.load_documents_from_drive(
 978 |             folder_id=folder_id,
 979 |             file_ids=file_ids,
 980 |             file_types=file_types,
 981 |             recursive=recursive,
 982 |             load_auth=load_auth,
 983 |             load_extended_metadata=load_extended_metadata,
 984 |             custom_query=custom_query,
 985 |             max_results=max_results,
 986 |             file_loader_cls=file_loader_cls,
 987 |             file_loader_kwargs=file_loader_kwargs
 988 |         )
 989 |         
 990 |         if not documents:
 991 |             logger.warning("No documents found. Check the folder ID and permissions.")
 992 |             if return_stats:
 993 |                 return 0, 0, {}, {"new": 0, "updated": 0, "unchanged": 0}
 994 |             return
 995 |         
 996 |         # Process documents if we have any
 997 |         total_docs = len(documents)
 998 |         if total_docs > 0:
 999 |             logger.info(f"Processing {total_docs} documents")
1000 |             
1001 |             # Process documents in parallel with timeout protection
1002 |             from rag.utils import process_documents_parallel
1003 |             
1004 |             # Set a shorter timeout per document to prevent hanging
1005 |             timeout_per_doc = 45  # 45 seconds per document
1006 |             
1007 |             # Select the appropriate document processing function based on whether enhanced media processing is enabled
1008 |             if enable_enhanced_media_processing:
1009 |                 logger.info("Using enhanced media processing (OCR, tables, image analysis)")
1010 |                 doc_processor = self.process_document_with_enhanced_media
1011 |             else:
1012 |                 logger.info("Using standard document processing")
1013 |                 doc_processor = self.process_document
1014 |             
1015 |             # Process documents in parallel
1016 |             processed_docs = process_documents_parallel(
1017 |                 doc_processor, 
1018 |                 documents, 
1019 |                 max_workers=4,  # Use 4 workers max
1020 |                 timeout_per_doc=timeout_per_doc
1021 |             )
1022 |             
1023 |             # Filter out None results from timeouts
1024 |             processed_docs = [doc for doc in processed_docs if doc[0] is not None]
1025 |             
1026 |             # Now split into chunks using RecursiveCharacterTextSplitter
1027 |             from langchain_text_splitters import RecursiveCharacterTextSplitter
1028 |             
1029 |             # Create text splitter with provided parameters
1030 |             text_splitter = RecursiveCharacterTextSplitter(
1031 |                 chunk_size=chunk_size,
1032 |                 chunk_overlap=chunk_overlap,
1033 |                 separators=["\n\n", "\n", ". ", " ", ""],
1034 |                 keep_separator=respect_content_boundaries
1035 |             )
1036 |             
1037 |             # Process each document and prepare for the vector store
1038 |             chunked_texts = []
1039 |             chunked_metadatas = []
1040 |             chunked_ids = []
1041 |             
1042 |             for text, metadata in processed_docs:
1043 |                 if not text:
1044 |                     logger.warning(f"Skipping document with empty content: {metadata.get('file_name', 'unknown')}")
1045 |                     continue
1046 |                 
1047 |                 # Generate a unique ID for the document if not present
1048 |                 doc_id = metadata.get('file_id', str(uuid.uuid4()))
1049 |                 
1050 |                 # Split text into chunks
1051 |                 doc_chunks = text_splitter.split_text(text)
1052 |                 
1053 |                 for j, chunk in enumerate(doc_chunks):
1054 |                     chunk_metadata = metadata.copy()
1055 |                     chunk_metadata['chunk'] = j
1056 |                     chunk_metadata['parent_id'] = doc_id
1057 |                     chunk_id = f"{doc_id}-chunk-{j}"
1058 |                     
1059 |                     chunked_texts.append(chunk)
1060 |                     chunked_metadatas.append(chunk_metadata)
1061 |                     chunked_ids.append(chunk_id)
1062 |             
1063 |             logger.info(f"Created {len(chunked_texts)} chunks from {len(processed_docs)} documents")
1064 |             
1065 |             # Track version statistics
1066 |             version_stats = {"new": 0, "updated": 0, "unchanged": 0}
1067 |             
1068 |             # Store in vector database
1069 |             logger.info(f"Passing skip_unchanged_documents={skip_unchanged_documents} to _store_in_supabase")
1070 |             self._store_in_supabase(chunked_texts, chunked_metadatas, chunked_ids, version_stats, skip_unchanged_documents)
1071 |             
1072 |             # Return statistics if requested
1073 |             if return_stats:
1074 |                 # Get a sample metadata for display
1075 |                 sample_metadata = chunked_metadatas[0] if chunked_metadatas else {}
1076 |                 
1077 |                 # Count unique parent documents
1078 |                 unique_parent_docs = len(set(meta.get('parent_id') for meta in chunked_metadatas))
1079 |                 
1080 |                 return unique_parent_docs, len(chunked_texts), sample_metadata, version_stats
1081 |         
1082 |         # Return empty statistics if no documents
1083 |         return 0, 0, {}, {"new": 0, "updated": 0, "unchanged": 0}
1084 |             
1085 |     def _store_in_supabase(self, texts, metadatas, ids, version_stats, skip_unchanged_documents=False):
1086 |         """Store documents in Supabase with version tracking.
1087 |         
1088 |         Args:
1089 |             texts: List of document texts
1090 |             metadatas: List of document metadatas
1091 |             ids: List of document IDs
1092 |             version_stats: Dictionary to track version statistics
1093 |             skip_unchanged_documents: Whether to skip processing unchanged documents
1094 |         """
1095 |         logger.info(f"Storing {len(texts)} chunks in Supabase vector store...")
1096 |         logger.info(f"Skip unchanged documents setting: {skip_unchanged_documents}")
1097 |         
1098 |         try:
1099 |             # Get Supabase client if available
1100 |             supabase_client = getattr(self.vectorstore, '_client', None)
1101 |             
1102 |             # Check each document to see if it already exists and needs updating
1103 |             new_texts = []
1104 |             new_metadatas = []
1105 |             new_ids = []
1106 |             update_texts = []
1107 |             update_metadatas = []
1108 |             update_ids = []
1109 |             skip_count = 0
1110 |             
1111 |             # Check each document to see if it already exists and needs updating
1112 |             for i, (text, metadata, doc_id) in enumerate(zip(texts, metadatas, ids)):
1113 |                 # Check if document exists and needs updating
1114 |                 needs_update, existing_metadata, content_changed, metadata_update_only = DocumentProcessor.check_document_version(
1115 |                     doc_id=doc_id,
1116 |                     metadata=metadata,
1117 |                     content=text,
1118 |                     supabase_client=supabase_client,
1119 |                     table_name=self.collection_name
1120 |                 )
1121 |                 
1122 |                 if existing_metadata:
1123 |                     # Document exists
1124 |                     if needs_update:
1125 |                         # Document needs update - increment version
1126 |                         if 'version_info' in existing_metadata:
1127 |                             # Get existing version
1128 |                             version_info = existing_metadata['version_info']
1129 |                             current_version = version_info.get('version', '1.0')
1130 |                             
1131 |                             # Increment version
1132 |                             version_parts = current_version.split('.')
1133 |                             if len(version_parts) >= 2:
1134 |                                 major, minor = version_parts[0], version_parts[1]
1135 |                                 new_minor = str(int(minor) + 1)
1136 |                                 new_version = f"{major}.{new_minor}"
1137 |                             else:
1138 |                                 new_version = f"{current_version}.1"
1139 |                             
1140 |                             # Update version info
1141 |                             new_version_info = {
1142 |                                 'version': new_version,
1143 |                                 'created_at': version_info.get('created_at', datetime.now().isoformat()),
1144 |                                 'updated_at': datetime.now().isoformat(),
1145 |                                 'previous_versions': version_info.get('previous_versions', []) + [{
1146 |                                     'version': current_version,
1147 |                                     'updated_at': version_info.get('updated_at', datetime.now().isoformat()),
1148 |                                     'content_changed': content_changed
1149 |                                 }]
1150 |                             }
1151 |                             
1152 |                             # Update metadata with new version info
1153 |                             metadata['version_info'] = new_version_info
1154 |                             
1155 |                             # Also add a flag indicating if content changed
1156 |                             metadata['content_changed'] = content_changed
1157 |                         
1158 |                         # Add to update list
1159 |                         update_texts.append(text)
1160 |                         update_metadatas.append(metadata)
1161 |                         update_ids.append(doc_id)
1162 |                         
1163 |                         logger.info(f"Document {doc_id} will be updated to version {new_version} (content changed: {content_changed})")
1164 |                     elif metadata_update_only:
1165 |                         # Only metadata changed, like modified_time - we skip re-embedding
1166 |                         # but we want to track it in statistics
1167 |                         if 'metadata_only_skipped' not in version_stats:
1168 |                             version_stats['metadata_only_skipped'] = 0
1169 |                         version_stats['metadata_only_skipped'] += 1
1170 |                         
1171 |                         logger.info(f"Document {doc_id} has metadata changes only, skipping re-embedding")
1172 |                         skip_count += 1
1173 |                     else:
1174 |                         # Document hasn't changed
1175 |                         if skip_unchanged_documents:
1176 |                             # Skip this document as it hasn't changed
1177 |                             skip_count += 1
1178 |                             if 'unchanged' not in version_stats:
1179 |                                 version_stats['unchanged'] = 0
1180 |                             version_stats['unchanged'] += 1
1181 |                             
1182 |                             logger.info(f"Document {doc_id} hasn't changed, skipping")
1183 |                         else:
1184 |                             # Even though unchanged, we'll process it since skip_unchanged_documents is False
1185 |                             update_texts.append(text)
1186 |                             update_metadatas.append(metadata)
1187 |                             update_ids.append(doc_id)
1188 |                             logger.info(f"Document {doc_id} hasn't changed, but processing anyway")
1189 |                 else:
1190 |                     # Document doesn't exist, add to new list
1191 |                     new_texts.append(text)
1192 |                     new_metadatas.append(metadata)
1193 |                     new_ids.append(doc_id)
1194 |             
1195 |             # Insert new documents
1196 |             if new_texts:
1197 |                 logger.info(f"Adding {len(new_texts)} new documents to Supabase")
1198 |                 self.vectorstore.add_texts(
1199 |                     texts=new_texts,
1200 |                     metadatas=new_metadatas,
1201 |                     ids=new_ids
1202 |                 )
1203 |             
1204 |             # Update existing documents
1205 |             if update_texts:
1206 |                 logger.info(f"Updating {len(update_texts)} existing documents in Supabase")
1207 |                 for i, (text, metadata, doc_id) in enumerate(zip(update_texts, update_metadatas, update_ids)):
1208 |                     try:
1209 |                         # Check if content has changed or just metadata
1210 |                         content_changed = metadata.get('content_changed', True)
1211 |                         
1212 |                         # If content_changed is True, we need to delete and re-add
1213 |                         # If it's False (only metadata changed), we can still optimize in future
1214 |                         
1215 |                         # For now, we need to delete and re-add in both cases
1216 |                         # Most vector DBs don't support updating just metadata without regenerating embeddings
1217 |                         
1218 |                         # Delete the existing document
1219 |                         if hasattr(self.vectorstore, 'delete'):
1220 |                             self.vectorstore.delete(doc_id)
1221 |                         elif supabase_client:
1222 |                             supabase_client.table(self.collection_name).delete().eq('id', doc_id).execute()
1223 |                         
1224 |                         # Add the updated document
1225 |                         self.vectorstore.add_texts(
1226 |                             texts=[text],
1227 |                             metadatas=[metadata],
1228 |                             ids=[doc_id]
1229 |                         )
1230 |                         
1231 |                         # if content_changed:
1232 |                         #     logger.info(f"Successfully updated document {doc_id} (version {metadata['version_info']['version']}) - content changed")
1233 |                         # else:
1234 |                         #     logger.info(f"Successfully updated document {doc_id} (version {metadata['version_info']['version']}) - only metadata changed")
1235 |                             
1236 |                         # Track statistics based on content change status
1237 |                         if 'content_changed_stats' not in version_stats:
1238 |                             version_stats['content_changed_stats'] = {'content': 0, 'metadata_only': 0}
1239 |                         
1240 |                         if content_changed:
1241 |                             version_stats['content_changed_stats']['content'] += 1
1242 |                         else:
1243 |                             version_stats['content_changed_stats']['metadata_only'] += 1
1244 |                             
1245 |                     except Exception as e:
1246 |                         logger.error(f"Error updating document {doc_id}: {str(e)}")
1247 |             
1248 |             logger.info(f"Successfully processed {len(texts)} document chunks:")
1249 |             logger.info(f"  - {len(new_texts)} new documents added")
1250 |             
1251 |             # Display specific update statistics if available
1252 |             if 'content_changed_stats' in version_stats:
1253 |                 content_changes = version_stats['content_changed_stats']['content']
1254 |                 metadata_changes = version_stats['content_changed_stats']['metadata_only']
1255 |                 total_updates = content_changes + metadata_changes
1256 |                 
1257 |                 logger.info(f"  - {total_updates} documents updated:")
1258 |                 logger.info(f"    * {content_changes} with content changes")
1259 |                 logger.info(f"    * {metadata_changes} with metadata changes only")
1260 |             else:
1261 |                 logger.info(f"  - {len(update_texts)} documents updated")
1262 |             
1263 |             # Report on skipped documents
1264 |             unchanged_count = version_stats.get('unchanged', skip_count)
1265 |             metadata_only_skipped = version_stats.get('metadata_only_skipped', 0)
1266 |             
1267 |             if metadata_only_skipped > 0:
1268 |                 logger.info(f"  - {unchanged_count - metadata_only_skipped} documents unchanged (no changes)")
1269 |                 logger.info(f"  - {metadata_only_skipped} documents with metadata changes only (skipped re-embedding)")
1270 |             else:
1271 |                 logger.info(f"  - {unchanged_count} documents skipped (unchanged)")
1272 |             
1273 |             # Update version statistics
1274 |             version_stats['new'] = len(new_texts)
1275 |             version_stats['updated'] = len(update_texts)
1276 |             version_stats['unchanged'] = skip_count
1277 |             
1278 |             # You may also want to track metadata-only changes that were skipped
1279 |             # Initialize if not present
1280 |             if 'metadata_only_skipped' not in version_stats:
1281 |                 version_stats['metadata_only_skipped'] = 0
1282 |                 
1283 |             # Add skipped_metadata_changes to statistics display
1284 |             if version_stats.get('metadata_only_skipped', 0) > 0:
1285 |                 logger.info(f"  - {version_stats['metadata_only_skipped']} documents with only metadata changes (skipped re-embedding)")
1286 |         
1287 |         except Exception as e:
1288 |             logger.error(f"Error storing documents in Supabase: {str(e)}")
1289 |             raise
1290 | 
1291 | # Enhanced document ingestion with improved extraction
1292 | def ingest_documents_enhanced(
1293 |     source_path: str, 
1294 |     store: str = "supabase",
1295 |     collection_name: str = "langchain_docs",
1296 |     chunk_size: int = 1000,
1297 |     chunk_overlap: int = 200
1298 | ) -> None:
1299 |     """Ingest documents with enhanced extraction.
1300 |     
1301 |     Args:
1302 |         source_path: Path to the document or directory to ingest
1303 |         store: Vector store type ("supabase", "chroma", etc.)
1304 |         collection_name: Collection name in the vector store
1305 |         chunk_size: Size of text chunks
1306 |         chunk_overlap: Overlap between chunks
1307 |     """
1308 |     pass  # Implementation pending


--------------------------------------------------------------------------------
/rag/query.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """Query functionality for RAG systems."""
  3 | 
  4 | import os
  5 | import logging
  6 | from typing import List, Dict, Any, Optional, Union
  7 | from dotenv import load_dotenv
  8 | 
  9 | # Import core modules
 10 | from .store import SupabaseStore
 11 | 
 12 | # Configure logging
 13 | logging.basicConfig(
 14 |     level=logging.INFO,
 15 |     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 16 | )
 17 | logger = logging.getLogger(__name__)
 18 | 
 19 | # Load environment variables
 20 | load_dotenv()
 21 | 
 22 | def query_langchain(
 23 |     query: str,
 24 |     collection_name: str = "langchain_docs",
 25 |     top_k: int = 5,
 26 |     metadata_filter: Optional[Dict[str, Any]] = None,
 27 |     user_email: Optional[str] = None,
 28 |     enforce_security: bool = False
 29 | ) -> Dict[str, Any]:
 30 |     """Query documents using LangChain.
 31 |     
 32 |     Args:
 33 |         query: The query text
 34 |         collection_name: The collection name to query
 35 |         top_k: Maximum number of results to return
 36 |         metadata_filter: Metadata filters to apply
 37 |         user_email: User email for security filtering
 38 |         enforce_security: Whether to enforce security filtering
 39 |         
 40 |     Returns:
 41 |         Dictionary with query results
 42 |     """
 43 |     logger.info(f"Querying LangChain collection '{collection_name}' for: {query}")
 44 |     
 45 |     try:
 46 |         # Import required LangChain modules
 47 |         from langchain_openai import OpenAIEmbeddings
 48 |         
 49 |         # Initialize embedding model
 50 |         embeddings = OpenAIEmbeddings()
 51 |         
 52 |         # Initialize Supabase store
 53 |         store = SupabaseStore(collection_name=collection_name)
 54 |         vectorstore = store.get_langchain_store(embeddings)
 55 |         
 56 |         # Apply security filtering if requested
 57 |         if enforce_security:
 58 |             logger.info(f"Enforcing security filters for user: {user_email}")
 59 |             security_filter = build_security_filter(user_email)
 60 |             
 61 |             # Combine security filter with any existing metadata filter
 62 |             if metadata_filter:
 63 |                 combined_filter = {**metadata_filter, **security_filter}
 64 |             else:
 65 |                 combined_filter = security_filter
 66 |                 
 67 |             metadata_filter = combined_filter
 68 |         
 69 |         # Log the filter being used
 70 |         if metadata_filter:
 71 |             logger.info(f"Using metadata filter: {metadata_filter}")
 72 |         
 73 |         # Execute the query
 74 |         if metadata_filter:
 75 |             # Search with metadata filter
 76 |             try:
 77 |                 logger.info(f"Attempting similarity_search_with_score with k={top_k}")
 78 |                 results_with_scores = vectorstore.similarity_search_with_score(
 79 |                     query=query,
 80 |                     k=top_k,
 81 |                     filter=metadata_filter
 82 |                 )
 83 |                 
 84 |                 # Process results
 85 |                 results = []
 86 |                 for doc, score in results_with_scores:
 87 |                     results.append({
 88 |                         "content": doc.page_content,
 89 |                         "metadata": doc.metadata,
 90 |                         "similarity": score
 91 |                     })
 92 |                     
 93 |                 logger.info(f"Found {len(results)} results with metadata filtering")
 94 |             except Exception as e:
 95 |                 logger.error(f"Error with similarity_search_with_score: {str(e)}")
 96 |                 # Log the traceback for more details
 97 |                 import traceback
 98 |                 logger.error(f"Traceback: {traceback.format_exc()}")
 99 |                 logger.info("Falling back to similarity_search")
100 |                 
101 |                 # Fall back to regular search without scoring
102 |                 docs = vectorstore.similarity_search(
103 |                     query=query,
104 |                     k=top_k,
105 |                     filter=metadata_filter
106 |                 )
107 |                 
108 |                 # Process results without scores
109 |                 results = []
110 |                 for doc in docs:
111 |                     results.append({
112 |                         "content": doc.page_content,
113 |                         "metadata": doc.metadata,
114 |                         "similarity": None  # No similarity score available
115 |                     })
116 |                     
117 |                 logger.info(f"Found {len(results)} results with fallback method")
118 |         else:
119 |             # Search without metadata filter
120 |             try:
121 |                 logger.info(f"Attempting similarity_search_with_score with k={top_k}")
122 |                 results_with_scores = vectorstore.similarity_search_with_score(
123 |                     query=query,
124 |                     k=top_k
125 |                 )
126 |                 
127 |                 # Process results
128 |                 results = []
129 |                 for doc, score in results_with_scores:
130 |                     results.append({
131 |                         "content": doc.page_content,
132 |                         "metadata": doc.metadata,
133 |                         "similarity": score
134 |                     })
135 |                     
136 |                 logger.info(f"Found {len(results)} results without filtering")
137 |             except Exception as e:
138 |                 logger.error(f"Error with similarity_search_with_score: {str(e)}")
139 |                 # Log the traceback for more details
140 |                 import traceback
141 |                 logger.error(f"Traceback: {traceback.format_exc()}")
142 |                 logger.info("Falling back to similarity_search")
143 |                 
144 |                 # Fall back to regular search without scoring
145 |                 docs = vectorstore.similarity_search(
146 |                     query=query,
147 |                     k=top_k
148 |                 )
149 |                 
150 |                 # Process results without scores
151 |                 results = []
152 |                 for doc in docs:
153 |                     results.append({
154 |                         "content": doc.page_content,
155 |                         "metadata": doc.metadata,
156 |                         "similarity": None  # No similarity score available
157 |                     })
158 |                     
159 |                 logger.info(f"Found {len(results)} results with fallback method")
160 |         
161 |         # Return results with additional information
162 |         return {
163 |             "query": query,
164 |             "collection": collection_name,
165 |             "results": results,
166 |             "count": len(results),
167 |             "filter_applied": metadata_filter is not None,
168 |             "security_filtered": enforce_security
169 |         }
170 |         
171 |     except Exception as e:
172 |         logger.error(f"Error querying documents with LangChain: {str(e)}")
173 |         import traceback
174 |         logger.error(f"Traceback: {traceback.format_exc()}")
175 |         
176 |         return {
177 |             "query": query,
178 |             "collection": collection_name,
179 |             "results": [],
180 |             "count": 0,
181 |             "error": str(e)
182 |         }
183 | 
184 | def build_security_filter(user_email: Optional[str] = None) -> Dict[str, Any]:
185 |     """Build security filter based on user email.
186 |     
187 |     Args:
188 |         user_email: User email for security filtering
189 |         
190 |     Returns:
191 |         Security filter dictionary
192 |     """
193 |     # If no user email is provided, only return public documents
194 |     if not user_email:
195 |         return {"access_summary.is_public": True}
196 |     
197 |     # For a specific user, return documents they have access to
198 |     # This includes:
199 |     # 1. Documents they own
200 |     # 2. Documents they have edit access to
201 |     # 3. Documents they have view access to
202 |     # 4. Documents they have comment access to
203 |     # 5. Public documents
204 |     
205 |     # Use the $or operator to match any of these conditions
206 |     return {
207 |         "$or": [
208 |             {"access_summary.owners": user_email},
209 |             {"access_summary.editors": user_email},
210 |             {"access_summary.viewers": user_email},
211 |             {"access_summary.commenters": user_email},
212 |             {"access_summary.is_public": True}
213 |         ]
214 |     }
215 | 
216 | def format_results(results: Dict[str, Any], max_content_length: int = 200) -> str:
217 |     """Format query results for display.
218 |     
219 |     Args:
220 |         results: Query results dictionary
221 |         max_content_length: Maximum content length to show
222 |         
223 |     Returns:
224 |         Formatted string
225 |     """
226 |     if not results or not results.get("results"):
227 |         return "No results found."
228 |     
229 |     formatted = f"Found {results['count']} results for query: '{results['query']}'\n\n"
230 |     
231 |     for i, item in enumerate(results["results"]):
232 |         content = item["content"]
233 |         if len(content) > max_content_length:
234 |             content = content[:max_content_length] + "..."
235 |             
236 |         metadata = item["metadata"]
237 |         similarity = item.get("similarity")
238 |         
239 |         formatted += f"Result {i+1}:\n"
240 |         formatted += f"Content: {content}\n"
241 |         
242 |         if similarity is not None:
243 |             formatted += f"Similarity: {similarity:.4f}\n"
244 |             
245 |         # Show key metadata fields
246 |         if "title" in metadata:
247 |             formatted += f"Title: {metadata['title']}\n"
248 |         if "file_name" in metadata:
249 |             formatted += f"File: {metadata['file_name']}\n"
250 |         
251 |         formatted += "\n"
252 |     
253 |     return formatted


--------------------------------------------------------------------------------
/rag/store.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """Vector store interface for different vector database implementations."""
  3 | 
  4 | import os
  5 | import logging
  6 | from typing import List, Dict, Any, Optional, Union
  7 | from dotenv import load_dotenv
  8 | 
  9 | # Configure logging
 10 | logging.basicConfig(
 11 |     level=logging.INFO,
 12 |     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 13 | )
 14 | logger = logging.getLogger(__name__)
 15 | 
 16 | # Load environment variables
 17 | load_dotenv()
 18 | 
 19 | class VectorStoreInterface:
 20 |     """Abstract interface for vector database operations."""
 21 |     
 22 |     def __init__(self, collection_name: str):
 23 |         """Initialize a vector store interface.
 24 |         
 25 |         Args:
 26 |             collection_name: Name of the collection to use
 27 |         """
 28 |         self.collection_name = collection_name
 29 |     
 30 |     def add_documents(self, documents, metadatas, ids):
 31 |         """Add documents to the vector store.
 32 |         
 33 |         Args:
 34 |             documents: List of document texts
 35 |             metadatas: List of metadata dictionaries
 36 |             ids: List of unique IDs for the documents
 37 |         """
 38 |         raise NotImplementedError("Subclasses must implement this method")
 39 |     
 40 |     def query(self, query_text, n_results=5, where=None):
 41 |         """Query the vector store for similar documents.
 42 |         
 43 |         Args:
 44 |             query_text: Text to query
 45 |             n_results: Number of results to return
 46 |             where: Filter query based on metadata
 47 |             
 48 |         Returns:
 49 |             Dictionary with query results
 50 |         """
 51 |         raise NotImplementedError("Subclasses must implement this method")
 52 |     
 53 |     def get_collection_stats(self):
 54 |         """Get statistics about the collection.
 55 |         
 56 |         Returns:
 57 |             Dictionary with count of documents and other stats
 58 |         """
 59 |         raise NotImplementedError("Subclasses must implement this method")
 60 | 
 61 | class SupabaseStore(VectorStoreInterface):
 62 |     """A wrapper for Supabase vector database operations."""
 63 |     
 64 |     def __init__(
 65 |         self, 
 66 |         collection_name: str = "documents",
 67 |         create_table_if_not_exists: bool = True
 68 |     ):
 69 |         """Initialize the Supabase vector store.
 70 |         
 71 |         Args:
 72 |             collection_name: Name of the collection/table to use
 73 |             create_table_if_not_exists: Whether to create the table if it doesn't exist
 74 |         """
 75 |         super().__init__(collection_name)
 76 |         self.table_name = collection_name
 77 |         
 78 |         # Get Supabase credentials from environment
 79 |         self.supabase_url = os.getenv("SUPABASE_URL", "").strip("'")
 80 |         self.supabase_key = os.getenv("SUPABASE_SERVICE_KEY", "").strip("'")
 81 |         
 82 |         # Try to get postgres URL directly or construct it
 83 |         self.postgres_url = os.getenv("SUPABASE_POSTGRES_URL", "").strip("'")
 84 |         
 85 |         # If postgres URL is not provided, try to construct it
 86 |         if not self.postgres_url:
 87 |             project_ref = os.getenv("SUPABASE_PROJECT_REF", "").strip("'")
 88 |             if not project_ref and self.supabase_url:
 89 |                 # Extract project ref from supabase URL if available
 90 |                 try:
 91 |                     project_ref = self.supabase_url.replace("https://", "").replace("http://", "").split(".")[0]
 92 |                 except:
 93 |                     pass
 94 |             
 95 |             if project_ref:
 96 |                 # We need a password for postgres connection
 97 |                 # This is a fallback assuming default postgres user and trying to extract password from service key
 98 |                 # This is a heuristic and may not always work
 99 |                 password = "postgres"  # Default fallback
100 |                 try:
101 |                     # Try to extract some unique string from the service key to use as password
102 |                     if self.supabase_key and len(self.supabase_key) > 20:
103 |                         # Use part of the service key as password
104 |                         password = self.supabase_key.split(".")[1][:16]
105 |                 except:
106 |                     logger.warning("Could not extract password from service key, using default")
107 |                 
108 |                 # Construct postgres URL
109 |                 self.postgres_url = f"postgres://postgres:{password}@db.{project_ref}.supabase.co:5432/postgres"
110 |                 logger.info(f"Dynamically constructed postgres URL using project ref: {project_ref}")
111 |         
112 |         if not self.supabase_url or not self.supabase_key:
113 |             raise ValueError(
114 |                 "Supabase credentials not found. Please set SUPABASE_URL and "
115 |                 "SUPABASE_SERVICE_KEY environment variables."
116 |             )
117 |         
118 |         try:
119 |             # Import and initialize Supabase client
120 |             from supabase import create_client, Client
121 |             
122 |             logger.info(f"Initializing Supabase client with URL: {self.supabase_url}")
123 |             
124 |             # Check URL format and resolve hostname
125 |             import socket
126 |             url_hostname = self.supabase_url.replace("https://", "").replace("http://", "").split("/")[0]
127 |             try:
128 |                 socket.gethostbyname(url_hostname)
129 |                 logger.info(f"Successfully resolved host: {url_hostname}")
130 |             except socket.gaierror:
131 |                 logger.warning(f"Could not resolve host: {url_hostname}")
132 |                 # Try with project ID instead
133 |                 project_id = os.getenv("SUPABASE_PROJECT_ID", "").strip("'")
134 |                 if project_id:
135 |                     new_url = f"https://{project_id}.supabase.co"
136 |                     try:
137 |                         new_hostname = new_url.replace("https://", "").replace("http://", "").split("/")[0]
138 |                         socket.gethostbyname(new_hostname)
139 |                         logger.info(f"Successfully resolved alternative host: {new_hostname}")
140 |                         
141 |                         # Update the URL
142 |                         self.supabase_url = new_url
143 |                         logger.info(f"Updated URL to: {self.supabase_url}")
144 |                     except socket.gaierror:
145 |                         logger.error(f"Could not resolve alternative host either: {new_hostname}")
146 |                 else:
147 |                     logger.error("No SUPABASE_PROJECT_ID available to try as alternative")
148 |             
149 |             self.client = create_client(self.supabase_url, self.supabase_key)
150 |             logger.info("Successfully initialized Supabase client")
151 |         except Exception as e:
152 |             logger.error(f"Failed to initialize Supabase client: {str(e)}")
153 |             raise
154 |     
155 |     def get_langchain_store(self, embedding_function):
156 |         """Get a LangChain Supabase vector store.
157 |         
158 |         Args:
159 |             embedding_function: The embedding function to use
160 |             
161 |         Returns:
162 |             A LangChain SupabaseVectorStore instance
163 |         """
164 |         from langchain_community.vectorstores import SupabaseVectorStore
165 |         
166 |         try:
167 |             # Determine the appropriate query function name based on table_name
168 |             query_name = "match_documents"
169 |             if self.table_name == "langchain_example":
170 |                 query_name = "match_langchain_documents"
171 |             elif self.table_name == "langchain_docs":
172 |                 query_name = "match_langchain_docs"
173 |             
174 |             # Create the vector store
175 |             store = SupabaseVectorStore(
176 |                 client=self.client,
177 |                 embedding=embedding_function,
178 |                 table_name=self.table_name,
179 |                 query_name=query_name
180 |             )
181 |             
182 |             # Check available search methods for debugging
183 |             search_methods = []
184 |             if hasattr(store, 'similarity_search'):
185 |                 search_methods.append('similarity_search')
186 |             if hasattr(store, 'similarity_search_with_score'):
187 |                 search_methods.append('similarity_search_with_score')
188 |             if hasattr(store, 'search'):
189 |                 search_methods.append('search')
190 |             if hasattr(store, 'get_relevant_documents'):
191 |                 search_methods.append('get_relevant_documents')
192 |                 
193 |             logger.info(f"Available search methods: {', '.join(search_methods)}")
194 |             
195 |             # Add custom similarity_search_with_score method
196 |             logger.info("Adding custom similarity_search_with_score method")
197 |             def custom_similarity_search_with_score(self, query, k=4, filter=None):
198 |                 """Custom similarity_search_with_score implementation."""
199 |                 logger.info(f"Using custom similarity_search_with_score with k={k}")
200 |                 try:
201 |                     # Generate embedding for the query
202 |                     query_embedding = self._embedding.embed_query(query)
203 |                     
204 |                     # Define parameters with both match_count and max_results
205 |                     params = {
206 |                         "query_embedding": query_embedding,
207 |                         "match_count": k,
208 |                         "max_results": k
209 |                     }
210 |                     
211 |                     if filter is not None:
212 |                         logger.warning("Filters not fully implemented in custom method")
213 |                     
214 |                     # Make the query
215 |                     logger.debug(f"Executing RPC with params: {params}")
216 |                     response = self._client.rpc(self.query_name, params).execute()
217 |                     
218 |                     if not response.data:
219 |                         return []
220 |                     
221 |                     # Process results
222 |                     from langchain_core.documents import Document
223 |                     results = []
224 |                     for result in response.data:
225 |                         document = Document(
226 |                             page_content=result["content"],
227 |                             metadata=result["metadata"]
228 |                         )
229 |                         results.append((document, result["similarity"]))
230 |                     
231 |                     logger.info(f"Custom method returned {len(results)} results with scores")
232 |                     return results
233 |                     
234 |                 except Exception as e:
235 |                     logger.error(f"Error in custom similarity_search_with_score: {str(e)}")
236 |                     import traceback
237 |                     logger.error(f"Traceback: {traceback.format_exc()}")
238 |                     raise
239 |             
240 |             # Add method to the instance
241 |             import types
242 |             store.similarity_search_with_score = types.MethodType(custom_similarity_search_with_score, store)
243 |             
244 |             # Add our own similarity_search method if needed
245 |             if 'similarity_search' not in search_methods:
246 |                 logger.info("Adding custom similarity_search method")
247 |                 def custom_similarity_search(self, query, k=4, filter=None):
248 |                     """Custom similarity_search implementation."""
249 |                     logger.info(f"Using custom similarity_search with k={k}")
250 |                     try:
251 |                         # Use our custom similarity_search_with_score and extract just the documents
252 |                         results_with_scores = self.similarity_search_with_score(query, k=k, filter=filter)
253 |                         return [doc for doc, _ in results_with_scores]
254 |                     except Exception as e:
255 |                         logger.error(f"Error in custom_similarity_search: {str(e)}")
256 |                         
257 |                         # Fallback to original implementation if available
258 |                         if hasattr(self, 'similarity_search_by_vector'):
259 |                             logger.info("Falling back to similarity_search_by_vector")
260 |                             query_embedding = self._embedding.embed_query(query)
261 |                             return self.similarity_search_by_vector(query_embedding, k=k, filter=filter)
262 |                         else:
263 |                             logger.warning("No suitable search method found")
264 |                             return []
265 |                 
266 |                 # Add method to the instance
267 |                 store.similarity_search = types.MethodType(custom_similarity_search, store)
268 |             
269 |             # Log success
270 |             logger.info("Successfully created LangChain Supabase vector store")
271 |             return store
272 |         
273 |         except Exception as e:
274 |             logger.error(f"Failed to create LangChain Supabase vector store: {str(e)}")
275 |             raise
276 |     
277 |     def query_raw(self, query_text: str, match_count: int = 5) -> Dict[str, Any]:
278 |         """Perform a raw query directly using Supabase.
279 |         
280 |         Args:
281 |             query_text: Text to query
282 |             match_count: Number of matches to return
283 |             
284 |         Returns:
285 |             Query results
286 |         """
287 |         # This method needs an embedding function to work properly
288 |         logger.warning("query_raw() method called but is not fully implemented - needs embedding function")
289 |         logger.info("Use LangChain or other interfaces for querying instead")
290 |         
291 |         # Return empty results
292 |         return {"results": [], "message": "Direct query not implemented. Use LangChain interface instead."}
293 |     
294 |     def debug_metadata_retrieval(self, limit: int = 5) -> Dict[str, Any]:
295 |         """Debug function to directly query metadata from Supabase without embeddings.
296 |         
297 |         Args:
298 |             limit: Maximum number of records to return
299 |             
300 |         Returns:
301 |             Dictionary with debug information
302 |         """
303 |         logger.info(f"Attempting direct debug query of table {self.table_name} for metadata inspection")
304 |         
305 |         try:
306 |             # Direct query to get metadata from the table
307 |             response = self.client.table(self.table_name).select("id, metadata").limit(limit).execute()
308 |             
309 |             # Log the structure of the response for debugging
310 |             logger.info(f"Supabase response structure: {type(response)}")
311 |             if hasattr(response, 'data'):
312 |                 logger.info(f"Response data type: {type(response.data)}")
313 |                 logger.info(f"Response data length: {len(response.data)}")
314 |                 
315 |                 if len(response.data) > 0:
316 |                     logger.info(f"First record keys: {response.data[0].keys() if isinstance(response.data[0], dict) else 'not a dict'}")
317 |                     
318 |                     # Check if metadata exists and what type it is
319 |                     if isinstance(response.data[0], dict) and 'metadata' in response.data[0]:
320 |                         metadata = response.data[0]['metadata']
321 |                         logger.info(f"Metadata type: {type(metadata)}")
322 |                         logger.info(f"Metadata content sample: {str(metadata)[:100]}")
323 |                     else:
324 |                         logger.warning("No metadata field found in the first record")
325 |                 
326 |                 return {
327 |                     "status": "success",
328 |                     "records_found": len(response.data),
329 |                     "sample_data": response.data[:limit],
330 |                     "message": "Direct metadata query completed"
331 |                 }
332 |             else:
333 |                 logger.warning("No data attribute in Supabase response")
334 |                 return {
335 |                     "status": "error", 
336 |                     "message": "No data attribute in Supabase response",
337 |                     "raw_response": str(response)
338 |                 }
339 |         
340 |         except Exception as e:
341 |             logger.error(f"Error in debug_metadata_retrieval: {str(e)}")
342 |             return {"status": "error", "message": str(e)}


--------------------------------------------------------------------------------
/rag/utils.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """Common utility functions for document processing and handling."""
  3 | 
  4 | import os
  5 | import uuid
  6 | import logging
  7 | import concurrent.futures
  8 | from typing import List, Dict, Any, Optional, Union, Callable, Tuple
  9 | from datetime import datetime
 10 | import json
 11 | import time
 12 | import ssl
 13 | import socket
 14 | 
 15 | # Configure logging
 16 | logging.basicConfig(
 17 |     level=logging.INFO,
 18 |     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 19 | )
 20 | logger = logging.getLogger(__name__)
 21 | 
 22 | class DocumentProcessor:
 23 |     """Utility class for document processing."""
 24 |     
 25 |     @staticmethod
 26 |     def process_documents_parallel(documents, process_func, max_workers=4):
 27 |         """Process multiple documents in parallel.
 28 |         
 29 |         Args:
 30 |             documents: List of documents to process
 31 |             process_func: Function to process each document
 32 |             max_workers: Maximum number of parallel workers
 33 |             
 34 |         Returns:
 35 |             Tuple of (texts, metadatas, ids)
 36 |         """
 37 |         texts = []
 38 |         metadatas = []
 39 |         ids = []
 40 |         
 41 |         logger.info(f"Processing {len(documents)} documents in parallel with {max_workers} workers")
 42 |         
 43 |         # Process documents in parallel
 44 |         with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
 45 |             # Submit all documents for processing
 46 |             future_to_document = {executor.submit(process_func, doc): doc for doc in documents}
 47 |             
 48 |             # Process results as they complete
 49 |             for future in concurrent.futures.as_completed(future_to_document):
 50 |                 document = future_to_document[future]
 51 |                 try:
 52 |                     text, metadata = future.result()
 53 |                     
 54 |                     # Generate a unique ID for the document
 55 |                     if metadata.get('file_id'):
 56 |                         doc_id = metadata['file_id']
 57 |                     else:
 58 |                         doc_id = str(uuid.uuid4())
 59 |                     
 60 |                     # Add to output lists
 61 |                     texts.append(text)
 62 |                     metadatas.append(metadata)
 63 |                     ids.append(doc_id)
 64 |                     
 65 |                 except Exception as e:
 66 |                     logger.error(f"Error processing document: {str(e)}")
 67 |         
 68 |         return texts, metadatas, ids
 69 |     
 70 |     @staticmethod
 71 |     def extract_google_drive_metadata(base_metadata, drive_item=None, auth_info=None):
 72 |         """Extract and standardize Google Drive metadata.
 73 |         
 74 |         Args:
 75 |             base_metadata: Base metadata dictionary
 76 |             drive_item: Google Drive item object
 77 |             auth_info: Authorization information
 78 |             
 79 |         Returns:
 80 |             Standardized metadata dictionary
 81 |         """
 82 |         metadata = base_metadata.copy() if base_metadata else {}
 83 |         
 84 |         # Extract file ID from various possible locations
 85 |         file_id = metadata.get('file_id') or metadata.get('source', '').split('://')[-1] or metadata.get('id', '')
 86 |         
 87 |         # Add standard properties
 88 |         if file_id:
 89 |             metadata['file_id'] = file_id
 90 |         
 91 |         # If we have drive_item, extract additional metadata
 92 |         if drive_item:
 93 |             metadata['source_type'] = 'google_drive'
 94 |             metadata['url'] = f"https://drive.google.com/file/d/{drive_item.get('id')}/view"
 95 |             
 96 |             # Extract basic properties
 97 |             for prop in ['name', 'mimeType', 'createdTime', 'modifiedTime', 'shared', 'starred', 'trashed', 'version', 'size', 'thumbnailLink']:
 98 |                 if prop in drive_item:
 99 |                     # Convert property name to snake_case
100 |                     snake_prop = ''.join(['_' + c.lower() if c.isupper() else c for c in prop]).lstrip('_')
101 |                     metadata[snake_prop] = drive_item[prop]
102 |             
103 |             # Standardize names (id -> file_id, name -> title, etc.)
104 |             if 'name' in metadata and 'title' not in metadata:
105 |                 metadata['title'] = metadata['name']
106 |             
107 |             # Handle owner information
108 |             if 'owners' in drive_item:
109 |                 metadata['owner_names'] = [owner.get('displayName') for owner in drive_item['owners']]
110 |                 metadata['owner_emails'] = [owner.get('emailAddress') for owner in drive_item['owners']]
111 |             
112 |             # Handle last modifying user
113 |             if 'lastModifyingUser' in drive_item:
114 |                 metadata['last_modified_by'] = drive_item['lastModifyingUser'].get('displayName')
115 |             
116 |             # Handle parent folders
117 |             if 'parents' in drive_item:
118 |                 metadata['parent_folders'] = drive_item['parents']
119 |         
120 |         # Add auth/permission information if available
121 |         if auth_info and 'permissions' in auth_info:
122 |             metadata['permissions'] = auth_info['permissions']
123 |             
124 |             # Extract owners, editors, viewers, and commenters
125 |             access_summary = {
126 |                 'owners': [],
127 |                 'editors': [],
128 |                 'viewers': [],
129 |                 'commenters': [],
130 |                 'is_public': False
131 |             }
132 |             
133 |             for perm in auth_info['permissions']:
134 |                 role = perm.get('role', '').lower()
135 |                 email = perm.get('emailAddress')
136 |                 perm_type = perm.get('type')
137 |                 
138 |                 # Check if document is public
139 |                 if perm_type == 'anyone':
140 |                     access_summary['is_public'] = True
141 |                 
142 |                 # Add email to appropriate role list
143 |                 if email:
144 |                     if role == 'owner':
145 |                         access_summary['owners'].append(email)
146 |                     elif role == 'writer':
147 |                         access_summary['editors'].append(email)
148 |                     elif role == 'reader':
149 |                         access_summary['viewers'].append(email)
150 |                     elif role == 'commenter':
151 |                         access_summary['commenters'].append(email)
152 |             
153 |             metadata['access_summary'] = access_summary
154 |         
155 |         return metadata
156 |     
157 |     @staticmethod
158 |     def enrich_metadata(metadata, doc_type=None, additional_metadata=None):
159 |         """Enrich metadata with standard fields.
160 |         
161 |         Args:
162 |             metadata: Base metadata dictionary
163 |             doc_type: Document type
164 |             additional_metadata: Additional metadata to add
165 |             
166 |         Returns:
167 |             Enriched metadata dictionary
168 |         """
169 |         enriched = metadata.copy()
170 |         
171 |         # Add standard metadata fields
172 |         enriched['doc_type'] = doc_type or enriched.get('doc_type', 'unknown')
173 |         enriched['processed_at'] = datetime.now().isoformat()
174 |         
175 |         # Add language if not present
176 |         if 'language' not in enriched:
177 |             enriched['language'] = 'en'
178 |         
179 |         # Add content type based on mime_type if available
180 |         if 'mime_type' in enriched and 'content_type' not in enriched:
181 |             mime_type = enriched['mime_type'].lower()
182 |             if 'pdf' in mime_type:
183 |                 enriched['content_type'] = 'pdf'
184 |             elif 'word' in mime_type or 'docx' in mime_type or 'doc' in mime_type:
185 |                 enriched['content_type'] = 'document'
186 |             elif 'sheet' in mime_type or 'excel' in mime_type or 'xlsx' in mime_type:
187 |                 enriched['content_type'] = 'spreadsheet'
188 |             elif 'presentation' in mime_type or 'ppt' in mime_type:
189 |                 enriched['content_type'] = 'presentation'
190 |             elif 'text' in mime_type:
191 |                 enriched['content_type'] = 'text'
192 |             elif 'image' in mime_type:
193 |                 enriched['content_type'] = 'image'
194 |             else:
195 |                 enriched['content_type'] = 'other'
196 |         
197 |         # Add processing status
198 |         enriched['status'] = 'processed'
199 |         
200 |         # Add version info if not present
201 |         if 'version_info' not in enriched:
202 |             enriched['version_info'] = {
203 |                 'version': '1.0',
204 |                 'created_at': datetime.now().isoformat(),
205 |                 'updated_at': datetime.now().isoformat()
206 |             }
207 |         
208 |         # Add any additional metadata
209 |         if additional_metadata:
210 |             enriched.update(additional_metadata)
211 |         
212 |         return enriched
213 |     
214 |     @staticmethod
215 |     def check_document_version(doc_id, metadata, content, supabase_client=None, table_name=None):
216 |         """Check if document version needs updating.
217 |         
218 |         Args:
219 |             doc_id: Document ID
220 |             metadata: Document metadata
221 |             content: Document content
222 |             supabase_client: Supabase client
223 |             table_name: Table name
224 |             
225 |         Returns:
226 |             Tuple of (needs_update, existing_metadata, content_changed, metadata_update_only)
227 |             - needs_update: True if document needs version update
228 |             - existing_metadata: Existing document metadata or None if document is new
229 |             - content_changed: True if content has changed, False if only metadata changed
230 |             - metadata_update_only: True if only metadata changed but not content
231 |         """
232 |         if not supabase_client or not table_name:
233 |             # No way to check versions, assume it's new
234 |             return False, None, False, False
235 |         
236 |         try:
237 |             # Check if document exists
238 |             response = supabase_client.table(table_name).select("id, metadata, content").eq("id", doc_id).execute()
239 |             
240 |             if hasattr(response, 'data') and response.data:
241 |                 # Document exists, check if it needs updating
242 |                 existing_doc = response.data[0]
243 |                 existing_metadata = existing_doc.get('metadata', {})
244 |                 existing_content = existing_doc.get('content', '')
245 |                 
246 |                 # Initialize flags
247 |                 needs_update = False
248 |                 content_changed = False
249 |                 metadata_update_only = False
250 |                 
251 |                 # Check if content has changed - only this should trigger a re-embedding
252 |                 if content != existing_content:
253 |                     logger.info(f"Document {doc_id} content has changed, will update version and re-embed")
254 |                     needs_update = True
255 |                     content_changed = True
256 |                 
257 |                 # Check if modified_time has changed (if available)
258 |                 elif metadata.get('modified_time') and existing_metadata.get('modified_time'):
259 |                     if metadata['modified_time'] > existing_metadata['modified_time']:
260 |                         logger.info(f"Document {doc_id} modified time has changed, but content is identical - SKIPPING update")
261 |                         # We don't mark it for update since content is the same
262 |                         needs_update = False
263 |                         content_changed = False
264 |                         metadata_update_only = True
265 |                 
266 |                 # No changes detected
267 |                 if not needs_update and not metadata_update_only:
268 |                     logger.info(f"Document {doc_id} has not changed, will not update")
269 |                 
270 |                 return needs_update, existing_metadata, content_changed, metadata_update_only
271 |             else:
272 |                 # Document doesn't exist
273 |                 logger.info(f"Document {doc_id} is new, will create")
274 |                 return False, None, False, False
275 |                 
276 |         except Exception as e:
277 |             logger.warning(f"Error checking document version: {str(e)}")
278 |             # Assume it needs updating in case of error
279 |             return True, None, True, False
280 | 
281 | def list_drive_folders(auth=None, folder_id=None, depth=1):
282 |     """List folders in Google Drive.
283 |     
284 |     Args:
285 |         auth: GoogleDriveAuth instance
286 |         folder_id: ID of the folder to list (None for root)
287 |         depth: How many levels to traverse
288 |         
289 |     Returns:
290 |         Dictionary of folder structure
291 |     """
292 |     if auth is None:
293 |         from .auth import GoogleDriveAuth
294 |         auth = GoogleDriveAuth()
295 |     
296 |     service = auth.get_drive_service()
297 |     
298 |     def get_folder_details(folder_id, current_depth=0):
299 |         """Recursively get folder details."""
300 |         if current_depth >= depth:
301 |             return {"name": "...", "id": folder_id, "folders": []}
302 |         
303 |         if folder_id:
304 |             # Get folder details
305 |             folder = service.files().get(fileId=folder_id, fields="name,id").execute()
306 |             name = folder.get("name", "Unknown")
307 |             
308 |             # Get subfolders
309 |             query = f"'{folder_id}' in parents and mimeType='application/vnd.google-apps.folder' and trashed=false"
310 |         else:
311 |             # Root folder
312 |             name = "My Drive"
313 |             
314 |             # Get top-level folders
315 |             query = "mimeType='application/vnd.google-apps.folder' and 'root' in parents and trashed=false"
316 |         
317 |         # Get subfolders
318 |         results = service.files().list(q=query, fields="files(id, name)").execute()
319 |         subfolders = results.get("files", [])
320 |         
321 |         # Recursively get subfolder details
322 |         subfolder_details = []
323 |         for subfolder in subfolders:
324 |             if current_depth + 1 < depth:
325 |                 subfolder_details.append(get_folder_details(subfolder["id"], current_depth + 1))
326 |             else:
327 |                 subfolder_details.append({"name": subfolder["name"], "id": subfolder["id"], "folders": []})
328 |         
329 |         return {
330 |             "name": name,
331 |             "id": folder_id,
332 |             "folders": subfolder_details
333 |         }
334 |     
335 |     return get_folder_details(folder_id)
336 | 
337 | def process_with_timeout(func, *args, timeout=180, **kwargs):
338 |     """Execute a function with a timeout to prevent hanging.
339 |     
340 |     Args:
341 |         func: Function to execute
342 |         timeout: Maximum execution time in seconds
343 |         *args: Arguments to pass to the function
344 |         **kwargs: Keyword arguments to pass to the function
345 |         
346 |     Returns:
347 |         Result of the function or None if timeout
348 |     """
349 |     with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
350 |         future = executor.submit(func, *args, **kwargs)
351 |         try:
352 |             return future.result(timeout=timeout)
353 |         except concurrent.futures.TimeoutError:
354 |             logger.warning(f"Function {func.__name__} timed out after {timeout} seconds")
355 |             return None
356 |         except Exception as e:
357 |             logger.error(f"Function {func.__name__} failed with error: {str(e)}")
358 |             return None
359 |             
360 | def get_drive_permissions(file_id, drive_service, use_unverified_context=True, timeout=15):
361 |     """Get Google Drive permissions directly with enhanced error handling.
362 |     
363 |     Args:
364 |         file_id: Google Drive file ID
365 |         drive_service: Google Drive API service instance
366 |         use_unverified_context: Whether to use unverified SSL context
367 |         timeout: Timeout in seconds (reduced to 15s to prevent long hangs)
368 |         
369 |     Returns:
370 |         List of permissions or empty list if error
371 |     """
372 |     # Save original SSL context
373 |     original_ssl_context = ssl._create_default_https_context
374 |     original_timeout = socket.getdefaulttimeout()
375 |     
376 |     try:
377 |         # Apply timeout
378 |         socket.setdefaulttimeout(timeout)
379 |         
380 |         # Use unverified context if requested (helps with SSL issues)
381 |         if use_unverified_context:
382 |             ssl._create_default_https_context = ssl._create_unverified_context
383 |         
384 |         # Get permissions with the correct parameters
385 |         permissions_response = drive_service.permissions().list(
386 |             fileId=file_id,
387 |             fields="permissions(id, type, emailAddress, role, displayName)",
388 |             supportsAllDrives=True,
389 |             pageSize=100
390 |         ).execute()
391 |         
392 |         permissions = permissions_response.get('permissions', [])
393 |         
394 |         # Handle pagination if there are more permissions
395 |         page_token = permissions_response.get('nextPageToken')
396 |         while page_token:
397 |             next_page = drive_service.permissions().list(
398 |                 fileId=file_id,
399 |                 fields="permissions(id, type, emailAddress, role, displayName)",
400 |                 supportsAllDrives=True,
401 |                 pageSize=100,
402 |                 pageToken=page_token
403 |             ).execute()
404 |             permissions.extend(next_page.get('permissions', []))
405 |             page_token = next_page.get('nextPageToken')
406 |         
407 |         logger.info(f"Successfully fetched {len(permissions)} permissions for {file_id}")
408 |         return permissions
409 |     
410 |     except Exception as e:
411 |         # Check if this is an SSL error and provide a reassuring message
412 |         if "SSL:" in str(e):
413 |             logger.info(f"SSL connection issue detected for permissions - this is normal and will be handled during processing")
414 |             logger.debug(f"SSL details: {str(e)}")
415 |         else:
416 |             logger.warning(f"Could not fetch permissions for file {file_id}: {str(e)}")
417 |         
418 |         return []
419 |     
420 |     finally:
421 |         # Restore original SSL context and timeout
422 |         ssl._create_default_https_context = original_ssl_context
423 |         socket.setdefaulttimeout(original_timeout)
424 | 
425 | def process_documents_parallel(processor_func, documents, max_workers=None, chunk_size=1, timeout_per_doc=60):
426 |     """Process a list of documents in parallel.
427 |     
428 |     Args:
429 |         processor_func: Function to process each document
430 |         documents: List of documents to process
431 |         max_workers: Maximum number of parallel workers
432 |         chunk_size: Number of documents to process per worker
433 |         timeout_per_doc: Maximum time per document in seconds
434 |         
435 |     Returns:
436 |         List of processing results
437 |     """
438 |     if not documents:
439 |         return []
440 |         
441 |     # Determine number of workers based on CPU count
442 |     if max_workers is None:
443 |         max_workers = min(len(documents), os.cpu_count() or 4)
444 |     
445 |     logger.info(f"Processing {len(documents)} documents in parallel with {max_workers} workers")
446 |     results = []
447 |     
448 |     # Use ThreadPoolExecutor instead of ProcessPoolExecutor to avoid pickle errors
449 |     with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
450 |         futures = {}
451 |         for i, doc in enumerate(documents):
452 |             future = executor.submit(processor_func, doc)
453 |             futures[future] = i
454 |             
455 |         # Track completion and apply timeout
456 |         start_time = time.time()
457 |         completed = 0
458 |         for future in concurrent.futures.as_completed(futures.keys()):
459 |             doc_index = futures[future]
460 |             try:
461 |                 # Check if this document has been processing too long
462 |                 if time.time() - start_time > timeout_per_doc:
463 |                     logger.warning(f"Document {doc_index} processing timed out after {timeout_per_doc} seconds")
464 |                     results.append((None, None))
465 |                 else:
466 |                     result = future.result(timeout=timeout_per_doc)
467 |                     results.append(result)
468 |                     completed += 1
469 |                     logger.debug(f"Completed document {doc_index} ({completed}/{len(documents)})")
470 |             except concurrent.futures.TimeoutError:
471 |                 logger.warning(f"Document {doc_index} timed out during result collection")
472 |                 results.append((None, None))
473 |             except Exception as e:
474 |                 logger.error(f"Error processing document {doc_index}: {str(e)}")
475 |                 results.append((None, None))
476 |                 
477 |     return results


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | # Core dependencies
 2 | langchain>=0.1.0
 3 | langchain-community>=0.1.0
 4 | langchain-openai>=0.0.5
 5 | langchain-google-community[drive]>=0.0.10
 6 | langchain-googledrive>=0.0.10
 7 | openai>=1.0.0
 8 | 
 9 | # Google Drive integration
10 | google-auth>=2.23.0
11 | google-auth-httplib2>=0.1.0
12 | google-api-python-client>=2.0.0
13 | 
14 | # Database and vector storage
15 | supabase>=2.0.0
16 | postgrest>=0.11.0
17 | 
18 | # Document processing
19 | pypdf>=3.15.0
20 | unstructured>=0.10.0
21 | docx2txt>=0.8
22 | python-docx>=0.8.11
23 | 
24 | # OCR and image processing
25 | pytesseract>=0.3.10
26 | opencv-python>=4.8.0
27 | Pillow>=10.0.0
28 | 
29 | # Table extraction from PDFs
30 | tabula-py>=2.8.0
31 | camelot-py[cv]>=0.11.0
32 | pandas>=2.0.0
33 | ghostscript>=0.7
34 | 
35 | # Environment and utilities
36 | python-dotenv>=1.0.0
37 | requests>=2.30.0
38 | urllib3>=2.0.0
39 | 
40 | # Enhanced terminal output
41 | rich>=13.6.0
42 | typer>=0.9.0
43 | pyfiglet>=0.8.post1
44 | colorama>=0.4.6
45 | humanize>=4.8.0
46 | tabulate>=0.9.0 


--------------------------------------------------------------------------------
/run_examples.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Install required Python dependencies if needed
 4 | pip install pyfiglet humanize rich tabulate colorama
 5 | pip install --upgrade langchain langchain-community langchain-openai langchain-google-community langchain-text-splitters
 6 | 
 7 | # Set the PYTHONPATH to include only the current directory
 8 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 9 | export PYTHONPATH=$SCRIPT_DIR
10 | 
11 | # Run the examples
12 | cd $SCRIPT_DIR
13 | python examples/langchain_examples_new.py
14 | 


--------------------------------------------------------------------------------
/setup.bat:
--------------------------------------------------------------------------------
 1 | @echo off
 2 | ECHO Setting up n8n RAG example environment...
 3 | 
 4 | :: Check for Python
 5 | python --version > NUL 2>&1
 6 | IF %ERRORLEVEL% NEQ 0 (
 7 |     ECHO Python is not installed or not in PATH. Please install Python 3.9+.
 8 |     EXIT /B 1
 9 | )
10 | 
11 | :: Create virtual environment
12 | IF NOT EXIST venv (
13 |     ECHO Creating virtual environment...
14 |     python -m venv venv
15 |     IF %ERRORLEVEL% NEQ 0 (
16 |         ECHO Failed to create virtual environment.
17 |         EXIT /B 1
18 |     )
19 | )
20 | 
21 | :: Activate virtual environment
22 | ECHO Activating virtual environment...
23 | CALL venv\Scripts\activate
24 | 
25 | :: Install requirements
26 | ECHO Installing dependencies...
27 | pip install -U pip setuptools wheel
28 | pip install -r requirements.txt
29 | 
30 | :: Create config directory if it doesn't exist
31 | IF NOT EXIST config (
32 |     mkdir config
33 | )
34 | 
35 | :: Create .env file if it doesn't exist
36 | IF NOT EXIST .env (
37 |     ECHO Creating .env file from template...
38 |     COPY .env.example .env
39 |     ECHO Please edit .env with your API keys and credentials.
40 | )
41 | 
42 | ECHO.
43 | ECHO Setup complete! To run the example:
44 | ECHO 1. Activate the virtual environment: venv\Scripts\activate
45 | ECHO 2. Run the example: python examples\langchain_examples_new.py
46 | 
47 | PAUSE 


--------------------------------------------------------------------------------
/setup.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Simple setup script for n8n-rag-example
 3 | 
 4 | echo "Setting up n8n RAG example environment..."
 5 | 
 6 | # Create virtual environment
 7 | if [ ! -d "venv" ]; then
 8 |     echo "Creating virtual environment..."
 9 |     python -m venv venv
10 |     if [ $? -ne 0 ]; then
11 |         echo "Failed to create virtual environment. Please ensure Python 3.9+ is installed."
12 |         exit 1
13 |     fi
14 | fi
15 | 
16 | # Activate virtual environment
17 | if [[ "$OSTYPE" == "msys" || "$OSTYPE" == "win32" ]]; then
18 |     # Windows using Git Bash or similar
19 |     source venv/Scripts/activate
20 | else
21 |     # Linux/macOS
22 |     source venv/bin/activate
23 | fi
24 | 
25 | # Install requirements
26 | echo "Installing dependencies..."
27 | pip install -U pip setuptools wheel
28 | pip install -r requirements.txt
29 | 
30 | # Create config directory if it doesn't exist
31 | if [ ! -d "config" ]; then
32 |     mkdir -p config
33 | fi
34 | 
35 | # Create .env file if it doesn't exist
36 | if [ ! -f ".env" ]; then
37 |     echo "Creating .env file from template..."
38 |     cp .env.example .env
39 |     echo "Please edit .env with your API keys and credentials."
40 | fi
41 | 
42 | echo "Setup complete! To run the example:"
43 | echo "1. Activate the virtual environment: source venv/bin/activate"
44 | echo "2. Run the example: python examples/langchain_examples_new.py" 


--------------------------------------------------------------------------------
/setup_vector_store.sql:
--------------------------------------------------------------------------------
  1 | -- Enable the pgvector extension
  2 | CREATE EXTENSION IF NOT EXISTS vector;
  3 | 
  4 | -------------------------------------------------------------------------------
  5 | -- 1. langchain_docs (ID as text)
  6 | -------------------------------------------------------------------------------
  7 | DROP TABLE IF EXISTS "langchain_docs";
  8 | CREATE TABLE "langchain_docs" (
  9 |     id TEXT PRIMARY KEY,
 10 |     content TEXT,
 11 |     metadata JSONB,
 12 |     embedding VECTOR(1536)
 13 | );
 14 | 
 15 | CREATE INDEX IF NOT EXISTS "langchain_docs_embedding_idx"
 16 |     ON "langchain_docs"
 17 |     USING ivfflat (embedding vector_l2_ops)
 18 |     WITH (lists = 100);
 19 | 
 20 | CREATE INDEX IF NOT EXISTS "langchain_docs_metadata_idx"
 21 |     ON "langchain_docs"
 22 |     USING GIN (metadata);
 23 | 
 24 | CREATE INDEX IF NOT EXISTS "langchain_docs_content_idx"
 25 |     ON "langchain_docs"
 26 |     USING GIN (to_tsvector('english', COALESCE(content, '')));
 27 | 
 28 | -------------------------------------------------------------------------------
 29 | -- 2. langchain_example (ID as text)
 30 | -------------------------------------------------------------------------------
 31 | DROP TABLE IF EXISTS "langchain_example";
 32 | CREATE TABLE "langchain_example" (
 33 |     id TEXT PRIMARY KEY,
 34 |     content TEXT,
 35 |     metadata JSONB,
 36 |     embedding VECTOR(1536)
 37 | );
 38 | 
 39 | CREATE INDEX IF NOT EXISTS "langchain_example_embedding_idx"
 40 |     ON "langchain_example"
 41 |     USING ivfflat (embedding vector_l2_ops)
 42 |     WITH (lists = 100);
 43 | 
 44 | CREATE INDEX IF NOT EXISTS "langchain_example_metadata_idx"
 45 |     ON "langchain_example"
 46 |     USING GIN (metadata);
 47 | 
 48 | CREATE INDEX IF NOT EXISTS "langchain_example_content_idx"
 49 |     ON "langchain_example"
 50 |     USING GIN (to_tsvector('english', COALESCE(content, '')));
 51 | 
 52 | -------------------------------------------------------------------------------
 53 | -- 3. documents (ID as bigserial - auto increments)
 54 | -------------------------------------------------------------------------------
 55 | DROP TABLE IF EXISTS "documents";
 56 | CREATE TABLE "documents" (
 57 |     id BIGSERIAL PRIMARY KEY,
 58 |     content TEXT,
 59 |     metadata JSONB,
 60 |     embedding VECTOR(1536)
 61 | );
 62 | 
 63 | CREATE INDEX IF NOT EXISTS "documents_embedding_idx"
 64 |     ON "documents"
 65 |     USING ivfflat (embedding vector_l2_ops)
 66 |     WITH (lists = 100);
 67 | 
 68 | CREATE INDEX IF NOT EXISTS "documents_metadata_idx"
 69 |     ON "documents"
 70 |     USING GIN (metadata);
 71 | 
 72 | CREATE INDEX IF NOT EXISTS "documents_content_idx"
 73 |     ON "documents"
 74 |     USING GIN (to_tsvector('english', COALESCE(content, '')));
 75 | 
 76 | -------------------------------------------------------------------------------
 77 | -- Functions for langchain_docs
 78 | -------------------------------------------------------------------------------
 79 | 
 80 | -- Vector-based similarity on langchain_docs
 81 | CREATE OR REPLACE FUNCTION match_documents (
 82 |     query_embedding VECTOR(1536),
 83 |     match_count INT,
 84 |     filter JSONB DEFAULT '{}'
 85 | )
 86 | RETURNS TABLE (
 87 |     id TEXT,
 88 |     content TEXT,
 89 |     metadata JSONB,
 90 |     similarity FLOAT
 91 | )
 92 | LANGUAGE plpgsql
 93 | AS $$
 94 | BEGIN
 95 |     RETURN QUERY
 96 |     SELECT
 97 |         langchain_docs.id,
 98 |         langchain_docs.content,
 99 |         langchain_docs.metadata,
100 |         1 - (langchain_docs.embedding <=> query_embedding) AS similarity
101 |     FROM langchain_docs
102 |     WHERE metadata @> filter
103 |     ORDER BY langchain_docs.embedding <=> query_embedding
104 |     LIMIT match_count;
105 | END;
106 | $$;
107 | 
108 | -- Vector-based similarity on langchain_example
109 | CREATE OR REPLACE FUNCTION match_example_documents (
110 |     query_embedding VECTOR(1536),
111 |     match_count INT,
112 |     filter JSONB DEFAULT '{}'
113 | )
114 | RETURNS TABLE (
115 |     id TEXT,
116 |     content TEXT,
117 |     metadata JSONB,
118 |     similarity FLOAT
119 | )
120 | LANGUAGE plpgsql
121 | AS $$
122 | BEGIN
123 |     RETURN QUERY
124 |     SELECT
125 |         langchain_example.id,
126 |         langchain_example.content,
127 |         langchain_example.metadata,
128 |         1 - (langchain_example.embedding <=> query_embedding) AS similarity
129 |     FROM langchain_example
130 |     WHERE metadata @> filter
131 |     ORDER BY langchain_example.embedding <=> query_embedding
132 |     LIMIT match_count;
133 | END;
134 | $$;
135 | 
136 | -- Hybrid search (keyword + vector) on langchain_docs
137 | CREATE OR REPLACE FUNCTION kw_match_documents (
138 |     query_text TEXT,
139 |     query_embedding VECTOR(1536),
140 |     match_count INT,
141 |     filter JSONB DEFAULT '{}'
142 | )
143 | RETURNS TABLE (
144 |     id TEXT,
145 |     content TEXT,
146 |     metadata JSONB,
147 |     similarity FLOAT,
148 |     text_similarity FLOAT
149 | )
150 | LANGUAGE plpgsql
151 | AS $$
152 | BEGIN
153 |     RETURN QUERY
154 |     SELECT
155 |         langchain_docs.id,
156 |         langchain_docs.content,
157 |         langchain_docs.metadata,
158 |         1 - (langchain_docs.embedding <=> query_embedding) AS similarity,
159 |         ts_rank(
160 |             to_tsvector('english', COALESCE(langchain_docs.content, '')),
161 |             plainto_tsquery('english', query_text)
162 |         ) AS text_similarity
163 |     FROM langchain_docs
164 |     WHERE metadata @> filter
165 |       AND (
166 |           to_tsvector('english', COALESCE(langchain_docs.content, ''))
167 |           @@ plainto_tsquery('english', query_text)
168 |           OR 1 - (langchain_docs.embedding <=> query_embedding) > 0.7
169 |       )
170 |     ORDER BY 
171 |         (
172 |           (1 - (langchain_docs.embedding <=> query_embedding)) * 0.8
173 |           + ts_rank(
174 |                 to_tsvector('english', COALESCE(langchain_docs.content, '')),
175 |                 plainto_tsquery('english', query_text)
176 |             ) * 0.2
177 |         ) DESC
178 |     LIMIT match_count;
179 | END;
180 | $$;
181 | 
182 | -------------------------------------------------------------------------------
183 | -- New Functions for the "documents" table
184 | -------------------------------------------------------------------------------
185 | 
186 | -- 1) Vector-based similarity on the "documents" table
187 | CREATE OR REPLACE FUNCTION match_main_documents (
188 |     query_embedding VECTOR(1536),
189 |     match_count INT,
190 |     filter JSONB DEFAULT '{}'
191 | )
192 | RETURNS TABLE (
193 |     id BIGINT,
194 |     content TEXT,
195 |     metadata JSONB,
196 |     similarity FLOAT
197 | )
198 | LANGUAGE plpgsql
199 | AS $$
200 | BEGIN
201 |     RETURN QUERY
202 |     SELECT
203 |         documents.id,
204 |         documents.content,
205 |         documents.metadata,
206 |         1 - (documents.embedding <=> query_embedding) AS similarity
207 |     FROM documents
208 |     WHERE metadata @> filter
209 |     ORDER BY documents.embedding <=> query_embedding
210 |     LIMIT match_count;
211 | END;
212 | $$;
213 | 
214 | -- 2) Hybrid search (keyword + vector) on the "documents" table
215 | CREATE OR REPLACE FUNCTION kw_match_main_documents (
216 |     query_text TEXT,
217 |     query_embedding VECTOR(1536),
218 |     match_count INT,
219 |     filter JSONB DEFAULT '{}'
220 | )
221 | RETURNS TABLE (
222 |     id BIGINT,
223 |     content TEXT,
224 |     metadata JSONB,
225 |     similarity FLOAT,
226 |     text_similarity FLOAT
227 | )
228 | LANGUAGE plpgsql
229 | AS $$
230 | BEGIN
231 |     RETURN QUERY
232 |     SELECT
233 |         documents.id,
234 |         documents.content,
235 |         documents.metadata,
236 |         1 - (documents.embedding <=> query_embedding) AS similarity,
237 |         ts_rank(
238 |             to_tsvector('english', COALESCE(documents.content, '')),
239 |             plainto_tsquery('english', query_text)
240 |         ) AS text_similarity
241 |     FROM documents
242 |     WHERE metadata @> filter
243 |       AND (
244 |           to_tsvector('english', COALESCE(documents.content, ''))
245 |           @@ plainto_tsquery('english', query_text)
246 |           OR 1 - (documents.embedding <=> query_embedding) > 0.7
247 |       )
248 |     ORDER BY 
249 |         (
250 |           (1 - (documents.embedding <=> query_embedding)) * 0.8
251 |           + ts_rank(
252 |                 to_tsvector('english', COALESCE(documents.content, '')),
253 |                 plainto_tsquery('english', query_text)
254 |             ) * 0.2
255 |         ) DESC
256 |     LIMIT match_count;
257 | END;
258 | $$;
259 | 


--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | """Utils package for the n8n RAG Example."""
2 | 
3 | # Import and expose the display manager
4 | from .display_utils import display 


--------------------------------------------------------------------------------
/utils/display_utils.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """
  3 | Advanced terminal output utilities for displaying RAG pipeline progress,
  4 | statistics, and results in a visually appealing and informative way.
  5 | """
  6 | 
  7 | import time
  8 | import json
  9 | import sys
 10 | from typing import Dict, List, Any, Optional, Union, Callable
 11 | import humanize
 12 | from datetime import datetime
 13 | from rich.console import Console
 14 | from rich.progress import Progress, TextColumn, BarColumn, TaskProgressColumn, TimeElapsedColumn
 15 | from rich.panel import Panel
 16 | from rich.text import Text
 17 | from rich.table import Table
 18 | from rich.syntax import Syntax
 19 | from rich.tree import Tree
 20 | from rich.live import Live
 21 | from rich.layout import Layout
 22 | from rich import box
 23 | from rich.prompt import Prompt, Confirm
 24 | from pyfiglet import Figlet
 25 | import colorama
 26 | from colorama import Fore, Style
 27 | from tabulate import tabulate
 28 | 
 29 | # Initialize colorama for cross-platform color support
 30 | colorama.init()
 31 | 
 32 | # Initialize rich console
 33 | console = Console()
 34 | 
 35 | class DisplayManager:
 36 |     """Manages advanced terminal displays for the RAG pipeline."""
 37 |     
 38 |     def __init__(self, show_advanced_output: bool = True, color_enabled: bool = True):
 39 |         """Initialize the display manager.
 40 |         
 41 |         Args:
 42 |             show_advanced_output: Whether to show advanced/detailed output
 43 |             color_enabled: Whether to use colored output
 44 |         """
 45 |         self.show_advanced_output = show_advanced_output
 46 |         self.color_enabled = color_enabled
 47 |         self.console = console
 48 |         self.start_time = time.time()
 49 |         
 50 |     def print_header(self, title: str, subtitle: Optional[str] = None):
 51 |         """Print a fancy header with optional subtitle.
 52 |         
 53 |         Args:
 54 |             title: The main title text
 55 |             subtitle: Optional subtitle text
 56 |         """
 57 |         if not self.show_advanced_output:
 58 |             print(f"\n=== {title} ===")
 59 |             if subtitle:
 60 |                 print(f"{subtitle}\n")
 61 |             return
 62 |             
 63 |         f = Figlet(font='slant')
 64 |         header_text = f.renderText(title)
 65 |         
 66 |         self.console.print("\n")
 67 |         self.console.print(Panel(
 68 |             Text(header_text, style="bold blue"),
 69 |             subtitle=subtitle,
 70 |             expand=False,
 71 |             border_style="blue",
 72 |             padding=(1, 3)
 73 |         ))
 74 |         self.console.print("\n")
 75 |         
 76 |     def print_section(self, title: str, description: Optional[str] = None):
 77 |         """Print a section header.
 78 |         
 79 |         Args:
 80 |             title: The section title
 81 |             description: Optional description
 82 |         """
 83 |         if not self.show_advanced_output:
 84 |             print(f"\n--- {title} ---")
 85 |             if description:
 86 |                 print(f"{description}")
 87 |             return
 88 |         
 89 |         self.console.print(f"\n[bold cyan]■ {title}[/bold cyan]")
 90 |         if description:
 91 |             self.console.print(f"  [dim]{description}[/dim]")
 92 |             
 93 |     def print_step(self, step_num: int, total_steps: int, title: str, description: Optional[str] = None):
 94 |         """Print a step header within a section.
 95 |         
 96 |         Args:
 97 |             step_num: Current step number
 98 |             total_steps: Total number of steps
 99 |             title: Step title
100 |             description: Optional step description
101 |         """
102 |         if not self.show_advanced_output:
103 |             print(f"\n[{step_num}/{total_steps}] {title}")
104 |             if description:
105 |                 print(f"  {description}")
106 |             return
107 |             
108 |         self.console.print(f"\n[bold green]Step {step_num}/{total_steps}:[/bold green] [yellow]{title}[/yellow]")
109 |         if description:
110 |             self.console.print(f"  [dim]{description}[/dim]")
111 |             
112 |     def print_info(self, message: str, prefix: Optional[str] = None):
113 |         """Print an info message.
114 |         
115 |         Args:
116 |             message: The info message
117 |             prefix: Optional prefix to add
118 |         """
119 |         prefix_text = f"{prefix}: " if prefix else ""
120 |         
121 |         if not self.show_advanced_output:
122 |             print(f"INFO: {prefix_text}{message}")
123 |             return
124 |             
125 |         self.console.print(f"[blue]ℹ[/blue] {prefix_text}[white]{message}[/white]")
126 |         
127 |     def print_success(self, message: str):
128 |         """Print a success message.
129 |         
130 |         Args:
131 |             message: The success message
132 |         """
133 |         if not self.show_advanced_output:
134 |             print(f"SUCCESS: {message}")
135 |             return
136 |             
137 |         self.console.print(f"[bold green]✓[/bold green] {message}")
138 |         
139 |     def print_warning(self, message: str):
140 |         """Print a warning message.
141 |         
142 |         Args:
143 |             message: The warning message
144 |         """
145 |         if not self.show_advanced_output:
146 |             print(f"WARNING: {message}")
147 |             return
148 |             
149 |         self.console.print(f"[bold yellow]⚠[/bold yellow] {message}")
150 |         
151 |     def print_error(self, message: str):
152 |         """Print an error message.
153 |         
154 |         Args:
155 |             message: The error message
156 |         """
157 |         if not self.show_advanced_output:
158 |             print(f"ERROR: {message}")
159 |             return
160 |             
161 |         self.console.print(f"[bold red]✗[/bold red] {message}")
162 |         
163 |     def print_file_info(self, file_name: str, mime_type: str, file_id: str, size: Optional[int] = None):
164 |         """Print file information.
165 |         
166 |         Args:
167 |             file_name: Name of the file
168 |             mime_type: MIME type of the file
169 |             file_id: ID of the file
170 |             size: Optional size of the file in bytes
171 |         """
172 |         if not self.show_advanced_output:
173 |             size_str = f" ({humanize.naturalsize(size)})" if size else ""
174 |             print(f"File: {file_name}{size_str} [{mime_type}] ID: {file_id}")
175 |             return
176 |             
177 |         size_str = f" ({humanize.naturalsize(size)})" if size else ""
178 |         
179 |         # Get file extension and use it for icon selection
180 |         ext = file_name.split('.')[-1].lower() if '.' in file_name else ''
181 |         icon = self._get_file_icon(ext, mime_type)
182 |         
183 |         self.console.print(f"[bold]{icon}[/bold] [cyan]{file_name}[/cyan]{size_str}")
184 |         self.console.print(f"  [dim]Type:[/dim] [yellow]{mime_type}[/yellow]")
185 |         self.console.print(f"  [dim]ID:[/dim] [blue]{file_id}[/blue]")
186 |         
187 |     def _get_file_icon(self, extension: str, mime_type: str) -> str:
188 |         """Get an appropriate icon for a file based on its extension or MIME type.
189 |         
190 |         Args:
191 |             extension: File extension
192 |             mime_type: MIME type
193 |             
194 |         Returns:
195 |             An icon character representing the file type
196 |         """
197 |         if 'pdf' in extension or 'pdf' in mime_type:
198 |             return "📄"
199 |         elif 'doc' in extension or 'word' in mime_type:
200 |             return "📝"
201 |         elif 'xls' in extension or 'sheet' in mime_type:
202 |             return "📊"
203 |         elif 'ppt' in extension or 'presentation' in mime_type:
204 |             return "📽️"
205 |         elif 'txt' in extension or 'text' in mime_type:
206 |             return "📄"
207 |         elif 'jpg' in extension or 'jpeg' in extension or 'png' in extension or 'image' in mime_type:
208 |             return "🖼️"
209 |         else:
210 |             return "📄"
211 |             
212 |     def create_progress_bar(self, total: int, description: str) -> Progress:
213 |         """Create a rich progress bar.
214 |         
215 |         Args:
216 |             total: Total number of items to process
217 |             description: Description of the process
218 |             
219 |         Returns:
220 |             A Progress object for tracking progress
221 |         """
222 |         if not self.show_advanced_output:
223 |             return None
224 |             
225 |         return Progress(
226 |             TextColumn("[bold blue]{task.description}"),
227 |             BarColumn(bar_width=50),
228 |             TaskProgressColumn(),
229 |             TimeElapsedColumn()
230 |         )
231 |         
232 |     def print_document_stats(self, docs_count: int, chunks_count: int, metadata_fields: List[str]):
233 |         """Print statistics about processed documents.
234 |         
235 |         Args:
236 |             docs_count: Number of documents processed
237 |             chunks_count: Number of chunks generated
238 |             metadata_fields: List of metadata fields captured
239 |         """
240 |         if not self.show_advanced_output:
241 |             print(f"\nDocument Stats:")
242 |             print(f"- Documents: {docs_count}")
243 |             print(f"- Chunks: {chunks_count}")
244 |             print(f"- Avg chunks per doc: {chunks_count/max(1, docs_count):.1f}")
245 |             print(f"- Metadata fields: {', '.join(metadata_fields[:5])}...")
246 |             return
247 |             
248 |         table = Table(title="📊 Document Processing Statistics", box=box.ROUNDED)
249 |         
250 |         table.add_column("Metric", style="cyan")
251 |         table.add_column("Value", style="green")
252 |         
253 |         table.add_row("Documents Processed", str(docs_count))
254 |         table.add_row("Chunks Generated", str(chunks_count))
255 |         
256 |         # Avoid division by zero
257 |         chunks_per_doc = chunks_count / max(1, docs_count) if docs_count > 0 else 0
258 |         table.add_row("Chunks per Document", f"{chunks_per_doc:.1f}")
259 |         
260 |         table.add_row("Processing Time", f"{time.time() - self.start_time:.2f}s")
261 |         
262 |         metadata_display = ", ".join(metadata_fields[:7])
263 |         if len(metadata_fields) > 7:
264 |             metadata_display += ", ..."
265 |             
266 |         table.add_row("Metadata Fields", metadata_display)
267 |         
268 |         self.console.print(table)
269 |         
270 |     def print_embedding_info(self, model: str, vector_dimension: int, processing_time: float, batch_size: int = 0):
271 |         """Print information about the embedding process.
272 |         
273 |         Args:
274 |             model: Embedding model name
275 |             vector_dimension: Dimension of vectors
276 |             processing_time: Time taken for embedding in seconds
277 |             batch_size: Batch size used for embedding
278 |         """
279 |         if not self.show_advanced_output:
280 |             print(f"\nEmbedding Info:")
281 |             print(f"- Model: {model}")
282 |             print(f"- Dimensions: {vector_dimension}")
283 |             print(f"- Processing time: {processing_time:.2f}s")
284 |             if batch_size:
285 |                 print(f"- Batch size: {batch_size}")
286 |             return
287 |             
288 |         panel = Panel(
289 |             Text.from_markup(
290 |                 f"[bold]Model:[/bold] [green]{model}[/green]\n"
291 |                 f"[bold]Vector Dimension:[/bold] [blue]{vector_dimension}[/blue]\n"
292 |                 f"[bold]Processing Time:[/bold] [yellow]{processing_time:.2f}s[/yellow]" +
293 |                 (f"\n[bold]Batch Size:[/bold] [magenta]{batch_size}[/magenta]" if batch_size else "")
294 |             ),
295 |             title="🔢 Embedding Information",
296 |             border_style="cyan"
297 |         )
298 |         
299 |         self.console.print(panel)
300 |         
301 |     def print_metadata_example(self, metadata: Dict[str, Any]):
302 |         """Print an example of the document metadata.
303 |         
304 |         Args:
305 |             metadata: A metadata dictionary to display
306 |         """
307 |         if not self.show_advanced_output:
308 |             print(f"\nMetadata Example:")
309 |             for key, value in list(metadata.items())[:5]:
310 |                 print(f"- {key}: {value}")
311 |             if len(metadata) > 5:
312 |                 print("- ...")
313 |             return
314 |             
315 |         # Create a nicely formatted JSON representation
316 |         json_str = json.dumps(metadata, indent=2)
317 |         
318 |         self.console.print(Panel(
319 |             Syntax(json_str, "json", theme="monokai", line_numbers=True),
320 |             title="📋 Metadata Example",
321 |             border_style="green",
322 |             expand=False
323 |         ))
324 |         
325 |     def print_chunk_example(self, text: str, metadata: Dict[str, Any]):
326 |         """Print an example of a document chunk with its metadata.
327 |         
328 |         Args:
329 |             text: Chunk text content
330 |             metadata: Chunk metadata
331 |         """
332 |         if not self.show_advanced_output:
333 |             print(f"\nChunk Example (first 100 chars):")
334 |             print(f"{text[:100]}...")
335 |             print("Metadata:")
336 |             for key, value in list(metadata.items())[:3]:
337 |                 print(f"- {key}: {value}")
338 |             if len(metadata) > 3:
339 |                 print("- ...")
340 |             return
341 |             
342 |         layout = Layout()
343 |         layout.split_column(
344 |             Layout(Panel(
345 |                 Text(text[:500] + "..." if len(text) > 500 else text, 
346 |                      style="white"),
347 |                 title="📄 Chunk Content Preview",
348 |                 border_style="blue",
349 |                 padding=(1, 2)
350 |             )),
351 |             Layout(Panel(
352 |                 Syntax(json.dumps(metadata, indent=2), "json", theme="monokai"),
353 |                 title="🏷️  Chunk Metadata",
354 |                 border_style="green",
355 |                 padding=(1, 2)
356 |             ))
357 |         )
358 |         
359 |         self.console.print(layout)
360 |         
361 |     def print_vector_storage_info(self, store_type: str, table_name: str, 
362 |                                   total_stored: int, failed: int = 0,
363 |                                   index_type: Optional[str] = None):
364 |         """Print information about the vector storage.
365 |         
366 |         Args:
367 |             store_type: Type of vector store (e.g., 'Supabase', 'Chroma')
368 |             table_name: Name of the table or collection
369 |             total_stored: Total number of vectors stored
370 |             failed: Number of storage operations that failed
371 |             index_type: Type of index used (if applicable)
372 |         """
373 |         if not self.show_advanced_output:
374 |             print(f"\nVector Storage:")
375 |             print(f"- Type: {store_type}")
376 |             print(f"- Table/Collection: {table_name}")
377 |             print(f"- Vectors stored: {total_stored}")
378 |             if failed:
379 |                 print(f"- Failed operations: {failed}")
380 |             if index_type:
381 |                 print(f"- Index type: {index_type}")
382 |             return
383 |             
384 |         # Use a different icon based on the store type
385 |         icon = "💾" if store_type.lower() == "supabase" else "🧠"
386 |         
387 |         success_rate = ((total_stored - failed) / total_stored * 100) if total_stored > 0 else 0
388 |         
389 |         table = Table(title=f"{icon} Vector Storage Summary", box=box.ROUNDED)
390 |         
391 |         table.add_column("Property", style="cyan")
392 |         table.add_column("Value", style="green")
393 |         
394 |         table.add_row("Store Type", store_type)
395 |         table.add_row("Table/Collection", table_name)
396 |         table.add_row("Total Vectors", str(total_stored))
397 |         
398 |         if failed:
399 |             table.add_row("Failed Operations", f"[red]{failed}[/red]")
400 |             table.add_row("Success Rate", f"{success_rate:.1f}%")
401 |             
402 |         if index_type:
403 |             table.add_row("Index Type", index_type)
404 |             
405 |         self.console.print(table)
406 |         
407 |     def print_performance_metrics(self, metrics: Dict[str, Union[float, int]]):
408 |         """Print performance metrics.
409 |         
410 |         Args:
411 |             metrics: Dictionary of metrics to display
412 |         """
413 |         if not self.show_advanced_output:
414 |             print(f"\nPerformance Metrics:")
415 |             for key, value in metrics.items():
416 |                 if isinstance(value, float):
417 |                     print(f"- {key}: {value:.2f}")
418 |                 else:
419 |                     print(f"- {key}: {value}")
420 |             return
421 |             
422 |         table = Table(title="⚡ Performance Metrics", box=box.ROUNDED)
423 |         
424 |         table.add_column("Metric", style="cyan")
425 |         table.add_column("Value", style="green")
426 |         
427 |         for key, value in metrics.items():
428 |             if isinstance(value, float):
429 |                 formatted_value = f"{value:.2f}"
430 |             else:
431 |                 formatted_value = str(value)
432 |                 
433 |             table.add_row(key, formatted_value)
434 |             
435 |         self.console.print(table)
436 |         
437 |     def print_advanced_rag_feature(self, feature_name: str, description: str, 
438 |                                   standard_approach: str, our_approach: str,
439 |                                   benefits: List[str]):
440 |         """Highlight an advanced RAG feature with comparison to standard approaches.
441 |         
442 |         Args:
443 |             feature_name: Name of the feature
444 |             description: Description of the feature
445 |             standard_approach: How it's done in standard/basic RAG
446 |             our_approach: How our implementation does it
447 |             benefits: List of benefits of our approach
448 |         """
449 |         if not self.show_advanced_output:
450 |             print(f"\nAdvanced Feature: {feature_name}")
451 |             print(f"- Description: {description}")
452 |             print(f"- Standard approach: {standard_approach}")
453 |             print(f"- Our approach: {our_approach}")
454 |             print("- Benefits:")
455 |             for benefit in benefits:
456 |                 print(f"  * {benefit}")
457 |             return
458 |             
459 |         panel = Panel(
460 |             Text.from_markup(
461 |                 f"[bold cyan]{feature_name}[/bold cyan]\n\n"
462 |                 f"{description}\n\n"
463 |                 f"[bold red]Standard Approach:[/bold red]\n"
464 |                 f"{standard_approach}\n\n"
465 |                 f"[bold green]Our Advanced Approach:[/bold green]\n"
466 |                 f"{our_approach}\n\n"
467 |                 f"[bold blue]Benefits:[/bold blue]"
468 |             ),
469 |             title="⭐ Advanced RAG Feature",
470 |             border_style="cyan",
471 |             padding=(1, 2)
472 |         )
473 |         
474 |         self.console.print(panel)
475 |         
476 |         # Print benefits as a bulleted list
477 |         for benefit in benefits:
478 |             self.console.print(f"  [bold blue]•[/bold blue] {benefit}")
479 |             
480 |     def create_metadata_tree(self, metadata: Dict[str, Any], title: str = "Metadata Structure") -> Tree:
481 |         """Create a tree visualization of metadata hierarchy.
482 |         
483 |         Args:
484 |             metadata: The metadata dictionary
485 |             title: Title for the tree
486 |             
487 |         Returns:
488 |             A Tree object representing the metadata structure
489 |         """
490 |         if not self.show_advanced_output:
491 |             return None
492 |             
493 |         tree = Tree(f"[bold]{title}[/bold]")
494 |         
495 |         def add_to_tree(node, data, parent_key=""):
496 |             if isinstance(data, dict):
497 |                 for key, value in data.items():
498 |                     display_key = f"[cyan]{key}[/cyan]"
499 |                     
500 |                     if isinstance(value, (dict, list)):
501 |                         branch = node.add(display_key)
502 |                         add_to_tree(branch, value, key)
503 |                     else:
504 |                         value_str = str(value)
505 |                         if len(value_str) > 50:
506 |                             value_str = value_str[:47] + "..."
507 |                             
508 |                         if key.endswith("_id") or key == "id":
509 |                             node.add(f"{display_key}: [blue]{value_str}[/blue]")
510 |                         elif "date" in key or "time" in key:
511 |                             node.add(f"{display_key}: [yellow]{value_str}[/yellow]")
512 |                         elif key == "source" or "source" in key:
513 |                             node.add(f"{display_key}: [green]{value_str}[/green]")
514 |                         else:
515 |                             node.add(f"{display_key}: {value_str}")
516 |             elif isinstance(data, list):
517 |                 if data and all(isinstance(x, dict) for x in data):
518 |                     # For lists of dictionaries, show only the first N items
519 |                     for i, item in enumerate(data[:3]):
520 |                         item_node = node.add(f"[magenta]Item {i}[/magenta]")
521 |                         add_to_tree(item_node, item)
522 |                     if len(data) > 3:
523 |                         node.add(f"[dim]... {len(data) - 3} more items ...[/dim]")
524 |                 else:
525 |                     # For simple lists or mixed content
526 |                     values = []
527 |                     for item in data[:5]:
528 |                         if isinstance(item, (dict, list)):
529 |                             item_type = type(item).__name__
530 |                             values.append(f"[{item_type}]")
531 |                         else:
532 |                             item_str = str(item)
533 |                             if len(item_str) > 30:
534 |                                 item_str = item_str[:27] + "..."
535 |                             values.append(item_str)
536 |                             
537 |                     display_value = ", ".join(values)
538 |                     if len(data) > 5:
539 |                         display_value += f", ... ({len(data) - 5} more)"
540 |                     node.add(f"[magenta]List[/magenta]: {display_value}")
541 |         
542 |         add_to_tree(tree, metadata)
543 |         return tree
544 |         
545 |     def create_tree(self, title: str) -> Tree:
546 |         """Create a simple tree with a title.
547 |         
548 |         Args:
549 |             title: Title for the tree
550 |             
551 |         Returns:
552 |             A Tree object with the given title
553 |         """
554 |         if not self.show_advanced_output:
555 |             return None
556 |             
557 |         return Tree(f"[bold]{title}[/bold]")
558 |         
559 |     def create_version_history_tree(self, version_info: Dict[str, Any], title: str = "Document Version History") -> Tree:
560 |         """Create a tree visualization of document version history.
561 |         
562 |         Args:
563 |             version_info: The version_info metadata dictionary
564 |             title: Title for the tree
565 |             
566 |         Returns:
567 |             A Tree object representing the version history
568 |         """
569 |         if not self.show_advanced_output:
570 |             return None
571 |             
572 |         tree = Tree(f"[bold]{title}[/bold]")
573 |         
574 |         current_version = version_info.get('version', '1.0')
575 |         created_at = version_info.get('created_at', 'unknown')
576 |         updated_at = version_info.get('updated_at', created_at)
577 |         
578 |         # Add current version
579 |         version_node = tree.add(f"[green]Current: v{current_version} ({updated_at})[/green]")
580 |         
581 |         # Add previous versions
582 |         previous_versions = version_info.get('previous_versions', [])
583 |         if previous_versions:
584 |             history_node = version_node.add("[yellow]Version History[/yellow]")
585 |             
586 |             for i, prev in enumerate(previous_versions):
587 |                 prev_version = prev.get('version', f"v{i}")
588 |                 prev_date = prev.get('updated_at', 'unknown')
589 |                 
590 |                 # Add details about this version
591 |                 version_entry = f"v{prev_version} ({prev_date})"
592 |                 if 'last_editor' in prev:
593 |                     version_entry += f" - {prev.get('last_editor')}"
594 |                     
595 |                 history_node.add(f"[blue]{version_entry}[/blue]")
596 |         else:
597 |             version_node.add("[dim]No previous versions[/dim]")
598 |             
599 |         return tree
600 |         
601 |     def display_comparison_table(self, comparisons: List[Dict[str, Any]], title: str = "Comparison"):
602 |         """Display a comparison table between different approaches.
603 |         
604 |         Args:
605 |             comparisons: List of dictionaries with comparison data
606 |             title: Title for the comparison table
607 |         """
608 |         if not self.show_advanced_output:
609 |             print(f"\n{title}:")
610 |             headers = comparisons[0].keys()
611 |             # Convert all values to strings
612 |             rows = [[str(value) for value in comp.values()] for comp in comparisons]
613 |             print(tabulate(rows, headers=headers, tablefmt="simple"))
614 |             return
615 |             
616 |         table = Table(title=f"🔍 {title}", box=box.ROUNDED)
617 |         
618 |         # Add columns from the first comparison item
619 |         if comparisons:
620 |             for key in comparisons[0].keys():
621 |                 style = "cyan" if key == "Feature" else None
622 |                 table.add_column(key, style=style)
623 |                 
624 |             # Add rows
625 |             for comp in comparisons:
626 |                 values = []
627 |                 for key, value in comp.items():
628 |                     # Apply styling based on column type
629 |                     if key == "Our Approach" or key == "Our Implementation":
630 |                         values.append(f"[green]{value}[/green]")
631 |                     elif key == "Basic Approach" or key == "Basic RAG":
632 |                         values.append(f"[red]{value}[/red]")
633 |                     else:
634 |                         # Convert any non-string values to strings
635 |                         values.append(str(value))
636 |                         
637 |                 table.add_row(*values)
638 |                 
639 |             self.console.print(table)
640 |     
641 |     def interactive_prompt(self, message: str, choices: Optional[List[str]] = None, 
642 |                           default: Optional[str] = None) -> str:
643 |         """Display an interactive prompt for user input.
644 |         
645 |         Args:
646 |             message: Prompt message
647 |             choices: Optional list of choices
648 |             default: Optional default value
649 |             
650 |         Returns:
651 |             User's response
652 |         """
653 |         if not self.show_advanced_output or not choices:
654 |             if default:
655 |                 user_input = input(f"{message} [{default}]: ") or default
656 |             else:
657 |                 user_input = input(f"{message}: ")
658 |             return user_input
659 |             
660 |         return Prompt.ask(
661 |             message,
662 |             choices=choices if choices else None,
663 |             default=default
664 |         )
665 |     
666 |     def confirm_prompt(self, question: str, default: bool = False) -> bool:
667 |         """Display a yes/no confirmation prompt.
668 |         
669 |         Args:
670 |             question: Question to ask
671 |             default: Default answer (True=Yes, False=No)
672 |             
673 |         Returns:
674 |             Boolean indicating user's response
675 |         """
676 |         if not self.show_advanced_output:
677 |             default_str = "Y/n" if default else "y/N"
678 |             response = input(f"{question} [{default_str}]: ").strip().lower()
679 |             
680 |             if not response:
681 |                 return default
682 |             
683 |             return response.startswith('y')
684 |             
685 |         return Confirm.ask(question, default=default)
686 | 
687 |     def start_spinner(self, message: str) -> None:
688 |         """Start a spinner with a message (simplified version).
689 |         
690 |         Args:
691 |             message: Message to display with the spinner
692 |         """
693 |         if not self.show_advanced_output:
694 |             print(f"{message}...")
695 |             return
696 |             
697 |         # In a real implementation, this would create and return a spinner object
698 |         # For simplicity, we're just printing the message
699 |         self.console.print(f"[bold cyan]{message}...[/bold cyan]")
700 | 
701 | # Instantiate a global display manager for use throughout the application
702 | display = DisplayManager(show_advanced_output=True, color_enabled=True)
703 | 
704 | # Sample usage function to demonstrate the display utilities
705 | def demo_display_utilities():
706 |     """Demo function to showcase the display utilities."""
707 |     # Header
708 |     display.print_header("RAG Document Processor", "Advanced Document Ingestion Demo")
709 |     
710 |     # Section
711 |     display.print_section("Document Loading", "Loading documents from Google Drive")
712 |     
713 |     # Step
714 |     display.print_step(1, 4, "Authenticating with Google Drive", "Setting up credentials and access")
715 |     
716 |     # Info messages
717 |     display.print_info("Using service account authentication")
718 |     display.print_success("Successfully authenticated with Google Drive")
719 |     
720 |     # File info
721 |     display.print_file_info(
722 |         "Important Document.pdf", 
723 |         "application/pdf", 
724 |         "1Ab3CdEfGhIjKlMnOpQrStUvWxYz", 
725 |         2048576
726 |     )
727 |     
728 |     # Progress with example data
729 |     with display.create_progress_bar(10, "Processing documents") as progress:
730 |         task = progress.add_task("Processing", total=10)
731 |         for i in range(10):
732 |             time.sleep(0.2)
733 |             progress.update(task, advance=1)
734 |             
735 |     # Display document stats
736 |     display.print_document_stats(
737 |         docs_count=5,
738 |         chunks_count=23,
739 |         metadata_fields=["file_name", "source", "created_time", "mime_type", 
740 |                        "page_number", "chunk", "embedding_model"]
741 |     )
742 |     
743 |     # Display embedding info
744 |     display.print_embedding_info(
745 |         model="text-embedding-ada-002",
746 |         vector_dimension=1536,
747 |         processing_time=3.45,
748 |         batch_size=20
749 |     )
750 |     
751 |     # Example metadata
752 |     example_metadata = {
753 |         "source": "google-drive://1Ab3CdEfGhIjKlMnOpQrStUvWxYz",
754 |         "file_name": "Important Document.pdf",
755 |         "mime_type": "application/pdf",
756 |         "created_time": "2023-05-15T10:23:45Z",
757 |         "modified_time": "2023-10-01T15:30:22Z",
758 |         "page_number": 3,
759 |         "chunk": 2,
760 |         "parent_id": "doc-1234",
761 |         "embedding_model": "text-embedding-ada-002",
762 |         "auth": {
763 |             "user": "john.doe@example.com",
764 |             "permissions": ["read", "write"]
765 |         },
766 |         "document_stats": {
767 |             "total_pages": 15,
768 |             "word_count": 3245,
769 |             "language": "en"
770 |         }
771 |     }
772 |     
773 |     # Display metadata example
774 |     display.print_metadata_example(example_metadata)
775 |     
776 |     # Display chunk example
777 |     display.print_chunk_example(
778 |         "This document outlines the procedures for handling sensitive information "
779 |         "according to the company's data protection policy. All employees must adhere "
780 |         "to these guidelines when processing customer data.",
781 |         example_metadata
782 |     )
783 |     
784 |     # Display vector storage info
785 |     display.print_vector_storage_info(
786 |         store_type="Supabase pgvector",
787 |         table_name="documents",
788 |         total_stored=23,
789 |         failed=0,
790 |         index_type="HNSW"
791 |     )
792 |     
793 |     # Display performance metrics
794 |     display.print_performance_metrics({
795 |         "Total Processing Time": 12.34,
796 |         "Documents per Second": 0.41,
797 |         "Chunks per Second": 1.87,
798 |         "Memory Usage (MB)": 156,
799 |         "API Calls": 8
800 |     })
801 |     
802 |     # Display advanced feature info
803 |     display.print_advanced_rag_feature(
804 |         feature_name="Hierarchical Metadata Extraction",
805 |         description="Extraction and preservation of document metadata throughout the RAG pipeline",
806 |         standard_approach="Basic extraction of filename and creation date only",
807 |         our_approach="Rich hierarchical metadata extraction including document structure, authorship, "
808 |                     "permissions, versioning, and context preservation across chunks",
809 |         benefits=[
810 |             "Enables advanced filtering and faceted search",
811 |             "Preserves document context for better retrieval",
812 |             "Supports detailed provenance tracking",
813 |             "Enables security-aware search results"
814 |         ]
815 |     )
816 |     
817 |     # Display metadata tree
818 |     tree = display.create_metadata_tree(example_metadata)
819 |     if tree:
820 |         display.console.print(tree)
821 |         
822 |     # Display comparison table
823 |     comparisons = [
824 |         {
825 |             "Feature": "Document Sources",
826 |             "Our Approach": "Multiple flexible loaders",
827 |             "Basic Approach": "Limited file types"
828 |         },
829 |         {
830 |             "Feature": "Metadata Extraction",
831 |             "Our Approach": "Rich, hierarchical with source tracking",
832 |             "Basic Approach": "Basic metadata or none"
833 |         },
834 |         {
835 |             "Feature": "Chunking Strategy",
836 |             "Our Approach": "Intelligent, content-aware with overlap",
837 |             "Basic Approach": "Simple fixed-size chunks"
838 |         },
839 |         {
840 |             "Feature": "Vector Storage",
841 |             "Our Approach": "Multiple options (Supabase pgvector, ChromaDB)",
842 |             "Basic Approach": "Limited options"
843 |         }
844 |     ]
845 |     
846 |     display.display_comparison_table(comparisons, "RAG Implementation Comparison")
847 |     
848 |     # Interactive prompt example
849 |     choice = display.interactive_prompt(
850 |         "Select a processing method",
851 |         choices=["basic", "advanced", "expert"],
852 |         default="advanced"
853 |     )
854 |     display.print_info(f"Selected: {choice}")
855 |     
856 |     # Confirmation prompt
857 |     confirmed = display.confirm_prompt("Would you like to process all documents?", default=True)
858 |     display.print_info(f"Process all: {'Yes' if confirmed else 'No'}")
859 | 
860 | if __name__ == "__main__":
861 |     # Run the demo if script is executed directly
862 |     demo_display_utilities()


--------------------------------------------------------------------------------