├── LICENSE.md
├── README.md
├── basic_usage.py
└── logger.py


/LICENSE.md:
--------------------------------------------------------------------------------
1 | MIT License
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # RAG Logger
  2 | 
  3 | RAG Logger is an open-source logging tool designed specifically for Retrieval-Augmented Generation (RAG) applications. It serves as a lightweight, open-source alternative to LangSmith, focusing on RAG-specific logging needs.
  4 | 
  5 | ## Features
  6 | 
  7 | - 📊 **Comprehensive RAG Pipeline Logging**
  8 |   - Query tracking
  9 |   - Retrieval results logging
 10 |   - LLM interaction recording
 11 |   - Step-by-step performance monitoring
 12 | 
 13 | - 💾 **Structured Storage**
 14 |   - JSON-based log format
 15 |   - Daily log organization
 16 |   - Automatic file management
 17 |   - Metadata enrichment
 18 | 
 19 | ## Quick Start
 20 | ```python
 21 | from logger import RAGLogger
 22 | 
 23 | # Initialize logger
 24 | logger = RAGLogger(log_dir="logs")
 25 | 
 26 | # Log a query
 27 | logger.log_query("What is machine learning?")
 28 | 
 29 | # Track retrieval step
 30 | logger.start_step("retrieval")
 31 | logger.log_retrieval(
 32 |     source="text",
 33 |     total_docs=100,
 34 |     retrieved_docs=[{"id": 1, "content": "..."}]
 35 | )
 36 | logger.end_step("retrieval")
 37 | 
 38 | # Record LLM interaction
 39 | logger.log_llm(
 40 |     llm_input="User query and context",
 41 |     llm_output="Generated response"
 42 | )
 43 | 
 44 | # Save logs
 45 | logger.save()
 46 | ```
 47 |  
 48 | ## Log Structure
 49 | ```json
 50 | {
 51 |     "timestamp": "2024-03-20 10:00:00",
 52 |     "query": "What is machine learning?",
 53 |     "total_time": 8.5,
 54 |     "steps": {
 55 |         "query_understanding": {
 56 |             "name": "query_understanding",
 57 |             "start_time": 1234567890.0,
 58 |             "end_time": 1234567891.0,
 59 |             "duration": 1.0,
 60 |             "metadata": {
 61 |                 "detected_intent": "definition_query",
 62 |                 "topic": "machine_learning",
 63 |                 "confidence": 0.95
 64 |             }
 65 |         },
 66 |         "text_embedding": {
 67 |             "name": "text_embedding", 
 68 |             "start_time": 1234567891.0,
 69 |             "end_time": 1234567892.5,
 70 |             "duration": 1.5,
 71 |             "metadata": {
 72 |                 "model": "text-embedding-3-small",
 73 |                 "embedding_dim": 1536,
 74 |                 "batch_size": 32
 75 |             }
 76 |         },
 77 |         "text_retrieval": {
 78 |             "name": "text_retrieval",
 79 |             "start_time": 1234567892.5,
 80 |             "end_time": 1234567894.0,
 81 |             "duration": 1.5,
 82 |             "metadata": {
 83 |                 "index_type": "faiss",
 84 |                 "top_k": 5,
 85 |                 "similarity_threshold": 0.7
 86 |             }
 87 |         },
 88 |         "llm_generation": {
 89 |             "name": "llm_generation",
 90 |             "start_time": 1234567894.0,
 91 |             "end_time": 1234567898.5,
 92 |             "duration": 4.5,
 93 |             "metadata": {
 94 |                 "model": "gpt-4o",
 95 |                 "max_tokens": 1024,
 96 |                 "temperature": 0.7
 97 |             }
 98 |         }
 99 |     },
100 |     "retrieval_results": {
101 |         "text": {
102 |             "total_docs": 1000,
103 |             "retrieved_docs": [
104 |                 {
105 |                     "id": "doc_123",
106 |                     "book": "Introduction to Machine Learning",
107 |                     "chapter": "Chapter 1: Overview",
108 |                     "content": "Machine learning is a core field of artificial intelligence...",
109 |                     "similarity_score": 0.92,
110 |                     "metadata": {
111 |                         "page": 12,
112 |                         "last_updated": "2024-01-01"
113 |                     }
114 |                 }
115 |             ],
116 |             "metadata": {
117 |                 "index_size": "2.5GB",
118 |                 "last_updated": "2024-03-19"
119 |             }
120 |         }
121 |     },
122 |     "llm_input": {
123 |         "query": "What is machine learning?",
124 |         "context": "...(retrieved text contents)",
125 |         "system_prompt": "You are a professional educational assistant...",
126 |         "metadata": {
127 |             "max_context_length": 4096,
128 |             "format": "markdown"
129 |         }
130 |     },
131 |     "llm_output": {
132 |         "content": "Machine learning is a key branch of artificial intelligence...",
133 |         "metadata": {
134 |             "token_count": 512,
135 |             "generation_time": 4.5
136 |         }
137 |     },
138 |     "messages": [
139 |         {
140 |             "timestamp": "2024-03-20 10:00:00",
141 |             "level": "INFO",
142 |             "step": "query_understanding",
143 |             "message": "Successfully identified query intent: definition_query"
144 |         },
145 |         {
146 |             "timestamp": "2024-03-20 10:00:01",
147 |             "level": "INFO", 
148 |             "step": "text_retrieval",
149 |             "message": "Retrieved 5 relevant documents from 1000 total documents"
150 |         },
151 |         {
152 |             "timestamp": "2024-03-20 10:00:03",
153 |             "level": "INFO",
154 |             "step": "llm_generation",
155 |             "message": "Response generation completed, tokens: 512"
156 |         }
157 |     ],
158 |     "error_tracking": {
159 |         "has_errors": false,
160 |         "error_count": 0,
161 |         "warnings": []
162 |     }
163 | }
164 | ```
165 | 
166 | ## 🚀 Roadmap
167 | 
168 | ### Phase 1: Core Enhancement (In Progress)
169 | - [ ] Add decorator support for automatic logging
170 | - [ ] Implement configuration management
171 | - [ ] Add performance analysis features
172 | - [ ] Integrate with common LLM providers
173 | 
174 | ### Phase 2: Version Control & Migration (Planned)
175 | - [ ] Implement prompt version management (similar to alembic)
176 | - [ ] Add migration system for prompts and configurations
177 | - [ ] Integrate with Git for change tracking
178 | - [ ] Create CLI tools for version management
179 | 
180 | ### Phase 3: Web Interface (Planned)
181 | - [ ] Build web dashboard for log visualization
182 | - [ ] Add prompt version management UI
183 | - [ ] Implement performance analysis views
184 | - [ ] Create A/B testing comparison tools
185 | 
186 | ### Phase 4: Advanced Features (Future)
187 | - [ ] Add evaluation metrics
188 | - [ ] Implement vector store integration
189 | - [ ] Add support for distributed logging
190 | - [ ] Create export/import functionality
191 | 


--------------------------------------------------------------------------------
/basic_usage.py:
--------------------------------------------------------------------------------
 1 | from logger import RAGLogger
 2 | 
 3 | def simulate_rag_process():
 4 |     # Initialize logger
 5 |     logger = RAGLogger(log_dir="logs", auto_save=True)
 6 |     
 7 |     try:
 8 |         # Log query
 9 |         query = "What is machine learning?"
10 |         logger.log_query(query)
11 |         
12 |         # Log text retrieval step
13 |         logger.start_step("text_retrieval")
14 |         # Simulate text retrieval process
15 |         text_docs = [
16 |             {"id": 1, "content": "Machine learning is a subfield of artificial intelligence"},
17 |             {"id": 2, "content": "Machine learning trains models using data"}
18 |         ]
19 |         logger.log_retrieval("text", total_docs=100, retrieved_docs=text_docs)
20 |         logger.end_step("text_retrieval")
21 |         
22 |         # Log image retrieval step
23 |         logger.start_step("image_retrieval")
24 |         # Simulate image retrieval process
25 |         image_docs = [
26 |             {"id": 1, "path": "ml_diagram.png"},
27 |             {"id": 2, "path": "neural_network.png"}
28 |         ]
29 |         logger.log_retrieval("image", total_docs=50, retrieved_docs=image_docs)
30 |         logger.end_step("image_retrieval")
31 |         
32 |         # Log LLM generation step
33 |         logger.start_step("llm_generation")
34 |         llm_input = f"Query: {query}\nContext: {text_docs}"
35 |         llm_output = "Machine learning is a method that improves system performance through data training..."
36 |         logger.log_llm(llm_input, llm_output)
37 |         logger.end_step("llm_generation")
38 |         
39 |         # Manually save log (if auto-save is disabled)
40 |         if not logger.auto_save:
41 |             logger.save()
42 |             
43 |     except Exception as e:
44 |         logger.error(f"Error during processing: {str(e)}")
45 |         raise
46 | 
47 | if __name__ == "__main__":
48 |     simulate_rag_process()
49 | 


--------------------------------------------------------------------------------
/logger.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import time
  3 | from datetime import datetime
  4 | import os
  5 | from typing import Any, Dict, List, Optional
  6 | from dataclasses import dataclass, asdict
  7 | 
  8 | @dataclass
  9 | class RAGStep:
 10 |     """Data class for recording RAG processing steps"""
 11 |     name: str
 12 |     start_time: float = 0.0
 13 |     end_time: float = 0.0
 14 |     duration: float = 0.0
 15 |     metadata: Dict[str, Any] = None
 16 | 
 17 | @dataclass
 18 | class RetrievalResult:
 19 |     """Data class for retrieval results"""
 20 |     total_docs: int = 0
 21 |     retrieved_docs: List[Dict] = None
 22 |     metadata: Dict[str, Any] = None
 23 | 
 24 | @dataclass
 25 | class RAGLogData:
 26 |     """Data class for RAG log data"""
 27 |     timestamp: str
 28 |     query: str
 29 |     total_time: float = 0.0
 30 |     steps: Dict[str, RAGStep] = None
 31 |     retrieval_results: Dict[str, RetrievalResult] = None
 32 |     llm_input: str = ""
 33 |     llm_output: str = ""
 34 |     messages: List[Dict] = None
 35 | 
 36 | class RAGLogger:
 37 |     """Logger for RAG (Retrieval-Augmented Generation) scenarios"""
 38 |     
 39 |     def __init__(self, log_dir: str = "logs", auto_save: bool = True):
 40 |         """
 41 |         Initialize RAG logger
 42 |         
 43 |         Args:
 44 |             log_dir: Directory for storing logs
 45 |             auto_save: Whether to automatically save logs (when logging ends)
 46 |         """
 47 |         self.log_dir = log_dir
 48 |         self.auto_save = auto_save
 49 |         self.start_time = time.time()
 50 |         
 51 |         # Create log directory structure
 52 |         self.today = datetime.now().strftime("%Y%m%d")
 53 |         self.daily_log_dir = os.path.join(self.log_dir, self.today)
 54 |         os.makedirs(self.daily_log_dir, exist_ok=True)
 55 |         
 56 |         # Initialize log data
 57 |         self.step_times = {}
 58 |         self.log_data = RAGLogData(
 59 |             timestamp=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
 60 |             query="",
 61 |             steps={},
 62 |             retrieval_results={},
 63 |             messages=[]
 64 |         )
 65 |         
 66 |         self.info("RAG Logger initialized successfully")
 67 | 
 68 |     def start_step(self, step_name: str, metadata: Dict[str, Any] = None) -> None:
 69 |         """
 70 |         Start recording a processing step
 71 |         
 72 |         Args:
 73 |             step_name: Name of the step
 74 |             metadata: Metadata related to the step
 75 |         """
 76 |         self.step_times[step_name] = time.time()
 77 |         self.log_data.steps[step_name] = RAGStep(
 78 |             name=step_name,
 79 |             start_time=self.step_times[step_name],
 80 |             metadata=metadata
 81 |         )
 82 |         print(f"[{step_name}] Started...")
 83 | 
 84 |     def end_step(self, step_name: str, metadata: Dict[str, Any] = None) -> None:
 85 |         """
 86 |         End recording a processing step
 87 |         
 88 |         Args:
 89 |             step_name: Name of the step
 90 |             metadata: Metadata related to the step
 91 |         """
 92 |         if step_name in self.step_times:
 93 |             end_time = time.time()
 94 |             duration = end_time - self.step_times[step_name]
 95 |             
 96 |             step = self.log_data.steps.get(step_name)
 97 |             if step:
 98 |                 step.end_time = end_time
 99 |                 step.duration = duration
100 |                 if metadata:
101 |                     step.metadata = metadata if not step.metadata else {**step.metadata, **metadata}
102 |                     
103 |             print(f"[{step_name}] Completed (Duration: {duration:.2f}s)")
104 | 
105 |     def log_retrieval(self, 
106 |                      source: str,
107 |                      total_docs: int,
108 |                      retrieved_docs: List[Dict],
109 |                      metadata: Dict[str, Any] = None) -> None:
110 |         """
111 |         Log retrieval results
112 |         
113 |         Args:
114 |             source: Retrieval source (e.g., 'text', 'image')
115 |             total_docs: Total number of documents
116 |             retrieved_docs: List of retrieved documents
117 |             metadata: Metadata related to retrieval
118 |         """
119 |         self.log_data.retrieval_results[source] = RetrievalResult(
120 |             total_docs=total_docs,
121 |             retrieved_docs=retrieved_docs,
122 |             metadata=metadata
123 |         )
124 |         print(f"[{source} Retrieval] Retrieved {len(retrieved_docs)} results from {total_docs} documents")
125 | 
126 |     def log_llm(self, llm_input: str, llm_output: str) -> None:
127 |         """
128 |         Log LLM interaction
129 |         
130 |         Args:
131 |             llm_input: Input content to LLM
132 |             llm_output: Output content from LLM
133 |         """
134 |         self.log_data.llm_input = llm_input
135 |         self.log_data.llm_output = llm_output
136 |         print(f"[LLM] Generated response (Length: {len(llm_output)})")
137 | 
138 |     def log_query(self, query: str) -> None:
139 |         """
140 |         Log query content
141 |         
142 |         Args:
143 |             query: User query content
144 |         """
145 |         self.log_data.query = query
146 |         print(f"[Query] {query}")
147 | 
148 |     def info(self, message: str) -> None:
149 |         """Log information level message"""
150 |         self._log_message("INFO", message)
151 | 
152 |     def warning(self, message: str) -> None:
153 |         """Log warning level message"""
154 |         self._log_message("WARNING", message)
155 | 
156 |     def error(self, message: str) -> None:
157 |         """Log error level message"""
158 |         self._log_message("ERROR", message)
159 | 
160 |     def _log_message(self, level: str, message: str) -> None:
161 |         """Internal method for logging messages"""
162 |         print(f"[{level}] {message}")
163 |         if not self.log_data.messages:
164 |             self.log_data.messages = []
165 |         self.log_data.messages.append({
166 |             "level": level,
167 |             "message": message,
168 |             "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
169 |         })
170 | 
171 |     def save(self, filename_prefix: str = "rag_log") -> str:
172 |         """
173 |         Save log to file
174 |         
175 |         Args:
176 |             filename_prefix: Prefix for log filename
177 |             
178 |         Returns:
179 |             str: Path to saved log file
180 |         """
181 |         self.log_data.total_time = time.time() - self.start_time
182 |         
183 |         # Generate filename
184 |         timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
185 |         filename = f"{filename_prefix}_{timestamp}.json"
186 |         filepath = os.path.join(self.daily_log_dir, filename)
187 |         
188 |         # Convert log data to dictionary
189 |         log_dict = asdict(self.log_data)
190 |         
191 |         # Save to file
192 |         with open(filepath, 'w', encoding='utf-8') as f:
193 |             json.dump(log_dict, f, ensure_ascii=False, indent=2)
194 |         
195 |         print(f"Log saved to: {filepath}")
196 |         return filepath
197 | 
198 |     def __del__(self):
199 |         """Destructor - save log if auto_save is enabled"""
200 |         if self.auto_save:
201 |             self.save()
202 | 


--------------------------------------------------------------------------------