├── LICENSE.md ├── README.md ├── basic_usage.py └── logger.py /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RAG Logger 2 | 3 | RAG Logger is an open-source logging tool designed specifically for Retrieval-Augmented Generation (RAG) applications. It serves as a lightweight, open-source alternative to LangSmith, focusing on RAG-specific logging needs. 4 | 5 | ## Features 6 | 7 | - 📊 **Comprehensive RAG Pipeline Logging** 8 | - Query tracking 9 | - Retrieval results logging 10 | - LLM interaction recording 11 | - Step-by-step performance monitoring 12 | 13 | - 💾 **Structured Storage** 14 | - JSON-based log format 15 | - Daily log organization 16 | - Automatic file management 17 | - Metadata enrichment 18 | 19 | ## Quick Start 20 | ```python 21 | from logger import RAGLogger 22 | 23 | # Initialize logger 24 | logger = RAGLogger(log_dir="logs") 25 | 26 | # Log a query 27 | logger.log_query("What is machine learning?") 28 | 29 | # Track retrieval step 30 | logger.start_step("retrieval") 31 | logger.log_retrieval( 32 | source="text", 33 | total_docs=100, 34 | retrieved_docs=[{"id": 1, "content": "..."}] 35 | ) 36 | logger.end_step("retrieval") 37 | 38 | # Record LLM interaction 39 | logger.log_llm( 40 | llm_input="User query and context", 41 | llm_output="Generated response" 42 | ) 43 | 44 | # Save logs 45 | logger.save() 46 | ``` 47 | 48 | ## Log Structure 49 | ```json 50 | { 51 | "timestamp": "2024-03-20 10:00:00", 52 | "query": "What is machine learning?", 53 | "total_time": 8.5, 54 | "steps": { 55 | "query_understanding": { 56 | "name": "query_understanding", 57 | "start_time": 1234567890.0, 58 | "end_time": 1234567891.0, 59 | "duration": 1.0, 60 | "metadata": { 61 | "detected_intent": "definition_query", 62 | "topic": 
"machine_learning", 63 | "confidence": 0.95 64 | } 65 | }, 66 | "text_embedding": { 67 | "name": "text_embedding", 68 | "start_time": 1234567891.0, 69 | "end_time": 1234567892.5, 70 | "duration": 1.5, 71 | "metadata": { 72 | "model": "text-embedding-3-small", 73 | "embedding_dim": 1536, 74 | "batch_size": 32 75 | } 76 | }, 77 | "text_retrieval": { 78 | "name": "text_retrieval", 79 | "start_time": 1234567892.5, 80 | "end_time": 1234567894.0, 81 | "duration": 1.5, 82 | "metadata": { 83 | "index_type": "faiss", 84 | "top_k": 5, 85 | "similarity_threshold": 0.7 86 | } 87 | }, 88 | "llm_generation": { 89 | "name": "llm_generation", 90 | "start_time": 1234567894.0, 91 | "end_time": 1234567898.5, 92 | "duration": 4.5, 93 | "metadata": { 94 | "model": "gpt-4o", 95 | "max_tokens": 1024, 96 | "temperature": 0.7 97 | } 98 | } 99 | }, 100 | "retrieval_results": { 101 | "text": { 102 | "total_docs": 1000, 103 | "retrieved_docs": [ 104 | { 105 | "id": "doc_123", 106 | "book": "Introduction to Machine Learning", 107 | "chapter": "Chapter 1: Overview", 108 | "content": "Machine learning is a core field of artificial intelligence...", 109 | "similarity_score": 0.92, 110 | "metadata": { 111 | "page": 12, 112 | "last_updated": "2024-01-01" 113 | } 114 | } 115 | ], 116 | "metadata": { 117 | "index_size": "2.5GB", 118 | "last_updated": "2024-03-19" 119 | } 120 | } 121 | }, 122 | "llm_input": { 123 | "query": "What is machine learning?", 124 | "context": "...(retrieved text contents)", 125 | "system_prompt": "You are a professional educational assistant...", 126 | "metadata": { 127 | "max_context_length": 4096, 128 | "format": "markdown" 129 | } 130 | }, 131 | "llm_output": { 132 | "content": "Machine learning is a key branch of artificial intelligence...", 133 | "metadata": { 134 | "token_count": 512, 135 | "generation_time": 4.5 136 | } 137 | }, 138 | "messages": [ 139 | { 140 | "timestamp": "2024-03-20 10:00:00", 141 | "level": "INFO", 142 | "step": "query_understanding", 143 | 
from logger import RAGLogger


def simulate_rag_process():
    """Run a mock RAG pipeline end to end and record every stage with RAGLogger."""
    # auto_save=True: the logger persists itself when it is garbage-collected.
    rag_logger = RAGLogger(log_dir="logs", auto_save=True)

    try:
        # Record the incoming user query first.
        query = "What is machine learning?"
        rag_logger.log_query(query)

        # Simulated corpora for the two retrieval sources.
        text_docs = [
            {"id": 1, "content": "Machine learning is a subfield of artificial intelligence"},
            {"id": 2, "content": "Machine learning trains models using data"}
        ]
        image_docs = [
            {"id": 1, "path": "ml_diagram.png"},
            {"id": 2, "path": "neural_network.png"}
        ]

        # Each retrieval stage follows the same start -> log -> end pattern,
        # so drive both from a small table instead of repeating the calls.
        retrieval_stages = [
            ("text_retrieval", "text", 100, text_docs),
            ("image_retrieval", "image", 50, image_docs),
        ]
        for step_name, source, total, docs in retrieval_stages:
            rag_logger.start_step(step_name)
            rag_logger.log_retrieval(source, total_docs=total, retrieved_docs=docs)
            rag_logger.end_step(step_name)

        # Simulated LLM generation stage.
        rag_logger.start_step("llm_generation")
        llm_input = f"Query: {query}\nContext: {text_docs}"
        llm_output = "Machine learning is a method that improves system performance through data training..."
        rag_logger.log_llm(llm_input, llm_output)
        rag_logger.end_step("llm_generation")

        # Manual save is only needed when auto-save is turned off.
        if not rag_logger.auto_save:
            rag_logger.save()

    except Exception as e:
        rag_logger.error(f"Error during processing: {str(e)}")
        raise


if __name__ == "__main__":
    simulate_rag_process()
"""Structured JSON logger for RAG (Retrieval-Augmented Generation) pipelines.

Collects the query, per-step timings, retrieval results, LLM input/output and
level-tagged messages into one dataclass tree, then serializes it to a JSON
file under ``<log_dir>/<YYYYMMDD>/``.
"""
import json
import os
import time
from dataclasses import asdict, dataclass
from datetime import datetime
from typing import Any, Dict, List, Optional


@dataclass
class RAGStep:
    """Data class for recording RAG processing steps."""
    name: str
    start_time: float = 0.0   # epoch seconds when the step began
    end_time: float = 0.0     # epoch seconds when the step finished
    duration: float = 0.0     # end_time - start_time
    metadata: Optional[Dict[str, Any]] = None


@dataclass
class RetrievalResult:
    """Data class for retrieval results from one source (e.g. 'text', 'image')."""
    total_docs: int = 0
    retrieved_docs: Optional[List[Dict]] = None
    metadata: Optional[Dict[str, Any]] = None


@dataclass
class RAGLogData:
    """Top-level data class that is serialized to the JSON log file."""
    timestamp: str
    query: str
    total_time: float = 0.0
    steps: Optional[Dict[str, RAGStep]] = None
    retrieval_results: Optional[Dict[str, RetrievalResult]] = None
    llm_input: str = ""
    llm_output: str = ""
    messages: Optional[List[Dict]] = None


class RAGLogger:
    """Logger for RAG (Retrieval-Augmented Generation) scenarios."""

    def __init__(self, log_dir: str = "logs", auto_save: bool = True):
        """
        Initialize RAG logger.

        Args:
            log_dir: Directory for storing logs
            auto_save: Whether to automatically save logs (when logging ends)
        """
        self.log_dir = log_dir
        self.auto_save = auto_save
        self.start_time = time.time()

        # Create a per-day subdirectory so logs are grouped by date.
        self.today = datetime.now().strftime("%Y%m%d")
        self.daily_log_dir = os.path.join(self.log_dir, self.today)
        os.makedirs(self.daily_log_dir, exist_ok=True)

        # Initialize log data.
        self.step_times: Dict[str, float] = {}
        self.log_data = RAGLogData(
            timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            query="",
            steps={},
            retrieval_results={},
            messages=[]
        )

        self.info("RAG Logger initialized successfully")

    def start_step(self, step_name: str, metadata: Optional[Dict[str, Any]] = None) -> None:
        """
        Start recording a processing step.

        Args:
            step_name: Name of the step
            metadata: Metadata related to the step
        """
        self.step_times[step_name] = time.time()
        self.log_data.steps[step_name] = RAGStep(
            name=step_name,
            start_time=self.step_times[step_name],
            metadata=metadata
        )
        print(f"[{step_name}] Started...")

    def end_step(self, step_name: str, metadata: Optional[Dict[str, Any]] = None) -> None:
        """
        End recording a processing step.

        A warning is logged (instead of failing silently) when the step was
        never started via :meth:`start_step`.

        Args:
            step_name: Name of the step
            metadata: Metadata related to the step; merged over any metadata
                passed to start_step (end-time values win on key conflicts)
        """
        if step_name not in self.step_times:
            self.warning(f"end_step called for unknown step: {step_name}")
            return

        end_time = time.time()
        duration = end_time - self.step_times[step_name]

        step = self.log_data.steps.get(step_name)
        if step:
            step.end_time = end_time
            step.duration = duration
            if metadata:
                step.metadata = metadata if not step.metadata else {**step.metadata, **metadata}

        print(f"[{step_name}] Completed (Duration: {duration:.2f}s)")

    def log_retrieval(self,
                      source: str,
                      total_docs: int,
                      retrieved_docs: List[Dict],
                      metadata: Optional[Dict[str, Any]] = None) -> None:
        """
        Log retrieval results.

        Args:
            source: Retrieval source (e.g., 'text', 'image')
            total_docs: Total number of documents
            retrieved_docs: List of retrieved documents
            metadata: Metadata related to retrieval
        """
        self.log_data.retrieval_results[source] = RetrievalResult(
            total_docs=total_docs,
            retrieved_docs=retrieved_docs,
            metadata=metadata
        )
        print(f"[{source} Retrieval] Retrieved {len(retrieved_docs)} results from {total_docs} documents")

    def log_llm(self, llm_input: str, llm_output: str) -> None:
        """
        Log LLM interaction.

        Args:
            llm_input: Input content to LLM
            llm_output: Output content from LLM
        """
        self.log_data.llm_input = llm_input
        self.log_data.llm_output = llm_output
        print(f"[LLM] Generated response (Length: {len(llm_output)})")

    def log_query(self, query: str) -> None:
        """
        Log query content.

        Args:
            query: User query content
        """
        self.log_data.query = query
        print(f"[Query] {query}")

    def info(self, message: str) -> None:
        """Log information level message."""
        self._log_message("INFO", message)

    def warning(self, message: str) -> None:
        """Log warning level message."""
        self._log_message("WARNING", message)

    def error(self, message: str) -> None:
        """Log error level message."""
        self._log_message("ERROR", message)

    def _log_message(self, level: str, message: str) -> None:
        """Internal method for logging messages."""
        print(f"[{level}] {message}")
        if not self.log_data.messages:
            self.log_data.messages = []
        self.log_data.messages.append({
            "level": level,
            "message": message,
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

    def save(self, filename_prefix: str = "rag_log") -> str:
        """
        Save log to file.

        Args:
            filename_prefix: Prefix for log filename

        Returns:
            str: Path to saved log file
        """
        self.log_data.total_time = time.time() - self.start_time

        # Generate a unique, timestamped filename inside the daily directory.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"{filename_prefix}_{timestamp}.json"
        filepath = os.path.join(self.daily_log_dir, filename)

        # Convert the dataclass tree to plain dicts for JSON serialization.
        log_dict = asdict(self.log_data)

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(log_dict, f, ensure_ascii=False, indent=2)

        print(f"Log saved to: {filepath}")
        return filepath

    def __del__(self):
        """Destructor - best-effort save when auto_save is enabled.

        __del__ may run during interpreter shutdown (when module globals such
        as os/json can already be torn down) or after a partially failed
        __init__ (when attributes are missing), so both the attribute access
        and the save are guarded: raising from a destructor cannot be caught
        by callers and only prints noise.
        """
        try:
            if getattr(self, "auto_save", False):
                self.save()
        except Exception:
            pass