├── .env ├── config.py ├── requirements.txt ├── docker-compose.yml ├── .gitignore ├── Output_Lesson_Plans └── MCQs_energy.json ├── readme.md ├── qdrant_connector.py ├── embedding_model.py ├── process_improved_data.py └── lesson_plan_generator.py /.env: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY= 2 | QDRANT_URL=http://localhost:6333 3 | QDRANT_HOST=localhost 4 | QDRANT_PORT=6333 5 | QDRANT_COLLECTION_NAME=science_9_collection 6 | QDRANT_VECTOR_SIZE=2560 7 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | # Default configuration for Qdrant Vector Database 2 | QDRANT_COLLECTION_NAME = "science_9_collection" 3 | QDRANT_VECTOR_SIZE = 4096 4 | QDRANT_HOST = "localhost" 5 | QDRANT_PORT = 6333 6 | 7 | # Default batch size for processing 8 | BATCH_SIZE = 10 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Core dependencies 2 | numpy>=1.20.0 3 | pandas>=1.3.0 4 | matplotlib>=3.4.0 5 | torch>=2.0.0 6 | transformers>=4.30.0 7 | httpx>=0.24.0 8 | 9 | # Embedding models 10 | transformers>=4.51.0 11 | sentence-transformers>=2.7.0 12 | 13 | # Vector database 14 | qdrant-client>=1.7.0 15 | 16 | # LangChain for RAG 17 | langchain>=0.1.0 18 | langchain-community>=0.0.10 19 | langchain-core>=0.1.0 20 | 21 | # Utils 22 | tqdm>=4.65.0 23 | python-dotenv>=1.0.0 24 | pydantic>=2.0.0 25 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | # EduPlan AI - Qdrant Vector Database 2 | services: 3 | qdrant: 4 | image: qdrant/qdrant:latest 5 | ports: 6 | - "6333:6333" 7 | - "6334:6334" 8 | volumes: 9 | - qdrant-storage:/qdrant/storage # 
Using named volume for FUSE compatibility 10 | environment: 11 | - QDRANT__STORAGE__WAL_CAPACITY_MB=32 12 | - QDRANT__STORAGE__WAL_SEGMENTS_AHEAD=0 13 | restart: unless-stopped 14 | volumes: 15 | qdrant-storage: 16 | driver: local 17 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.so 6 | .Python 7 | build/ 8 | develop-eggs/ 9 | dist/ 10 | downloads/ 11 | eggs/ 12 | .eggs/ 13 | lib/ 14 | lib64/ 15 | parts/ 16 | sdist/ 17 | var/ 18 | wheels/ 19 | pip-wheel-metadata/ 20 | share/python-wheels/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | MANIFEST 25 | 26 | # Virtual environments 27 | env/ 28 | venv/ 29 | ENV/ 30 | env.bak/ 31 | venv.bak/ 32 | eduplan_env/ 33 | 34 | # IDEs 35 | .vscode/ 36 | .env 37 | .idea/ 38 | *.swp 39 | *.swo 40 | 41 | # Project specific 42 | extracted_data/ 43 | rag_data/ 44 | *.log 45 | .env 46 | 47 | # Qdrant data 48 | # qdrant_storage/ 49 | qdrant_storage/collections/midjourney 50 | qdrant_storage/collections/test_collection 51 | -------------------------------------------------------------------------------- /Output_Lesson_Plans/MCQs_energy.json: -------------------------------------------------------------------------------- 1 | { 2 | "mcqs": [ 3 | { 4 | "question": "Which of the following is NOT a form of energy mentioned in the provided curriculum content?", 5 | "options": { 6 | "A": "Mechanical energy", 7 | "B": "Nuclear energy", 8 | "C": "Heat energy", 9 | "D": "Chemical energy" 10 | }, 11 | "correct_answer": "Nuclear energy", 12 | "explanation": "The provided curriculum content lists mechanical, heat, chemical, electrical, and light energy, but not nuclear energy." 
13 | }, 14 | { 15 | "question": "According to the curriculum content, what is the biggest natural source of energy for us?", 16 | "options": { 17 | "A": "The Earth’s core", 18 | "B": "The Sun", 19 | "C": "The Moon", 20 | "D": "Nuclear reactions" 21 | }, 22 | "correct_answer": "The Sun", 23 | "explanation": "The curriculum content states that the Sun is the biggest natural source of energy for us." 24 | }, 25 | { 26 | "question": "A lamp consumes 1000 J of electrical energy in 10 seconds. What is its power?", 27 | "options": { 28 | "A": "10 W", 29 | "B": "100 W", 30 | "C": "1000 W", 31 | "D": "10000 W" 32 | }, 33 | "correct_answer": "100 W", 34 | "explanation": "Power is calculated as energy divided by time. Therefore, 1000 J / 10 s = 100 W." 35 | }, 36 | { 37 | "question": "Which of the following energy sources is NOT directly derived from the Sun?", 38 | "options": { 39 | "A": "Solar energy", 40 | "B": "Wind energy", 41 | "C": "Geothermal energy", 42 | "D": "Hydroelectric energy" 43 | }, 44 | "correct_answer": "Geothermal energy", 45 | "explanation": "Geothermal energy comes from the Earth's interior, not directly from the Sun, unlike solar, wind, and hydroelectric energy." 46 | }, 47 | { 48 | "question": "What kind of energy conversion is involved in the formation of coal and petroleum?", 49 | "options": { 50 | "A": "Mechanical to chemical", 51 | "B": "Chemical to electrical", 52 | "C": "Solar to chemical", 53 | "D": "Nuclear to thermal" 54 | }, 55 | "correct_answer": "Solar to chemical", 56 | "explanation": "Coal and petroleum are formed from ancient organic matter that originally captured solar energy through photosynthesis, converting it into chemical energy." 
57 | } 58 | ], 59 | "topic": "Energy", 60 | "chapter": null, 61 | "section": null, 62 | "generated_at": "2025-09-01T18:37:49.396834", 63 | "source_count": 8, 64 | "sources_preview": [ 65 | { 66 | "chapter": "Unknown", 67 | "section": "Unknown", 68 | "content_type": "Unknown", 69 | "preview": "Luckily the world we live in provides energy in many different forms. The various forms include mech..." 70 | }, 71 | { 72 | "chapter": "Unknown", 73 | "section": "Unknown", 74 | "content_type": "Unknown", 75 | "preview": "Life is impossible without energy. The demand for energy is ever increasing. Where do we get energy ..." 76 | }, 77 | { 78 | "chapter": "Unknown", 79 | "section": "Unknown", 80 | "content_type": "Unknown", 81 | "preview": "Questions Section: Questions\nQuestions:\n1. 1. What is power?\n2. 2. Define 1 watt of power.\n3. 3. A l..." 82 | } 83 | ] 84 | } -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # EduPlan AI: Automated MCQ Generator with RAG & Qdrant 2 | 3 | ## Overview 4 | EduPlan AI is an automated lesson and MCQ generator for educational content. It uses Retrieval-Augmented Generation (RAG) with a Qdrant vector database and OpenAI's GPT models to generate high-quality multiple-choice questions (MCQs) from your curriculum PDFs and processed data. 5 | 6 | ## Features 7 | - **Semantic Search**: Uses Qdrant vector database for fast, relevant retrieval of curriculum content. 8 | - **MCQ Generation**: Generates exactly 5 MCQs for any topic using OpenAI's GPT-4o. 9 | - **Flexible Filtering**: Filter MCQs by chapter, section, or content type. 10 | - **Embeddings**: Supports Qwen3-Embedding-4B (from the Qwen team) and custom embedding pipelines. 11 | - **Easy Output**: MCQs are saved in `Output_Lesson_Plans` as JSON files for easy use. 
12 | 13 | ## Folder Structure 14 | ``` 15 | Qwen3_PDF/ 16 | ├── config.py # Project configuration (Qdrant host, port, etc.) 17 | ├── docker-compose.yml # Qdrant database container setup 18 | ├── embedding_model.py # Embedding pipeline (Qwen3/NVIDIA) 19 | ├── lesson_plan_generator.py # Main MCQ generator script 20 | ├── process_improved_data.py # Data processing and embedding script 21 | ├── qdrant_connector.py # Qdrant database connector 22 | ├── requirements.txt # Python dependencies 23 | ├── Output_Lesson_Plans/ # Generated MCQ JSON files 24 | ├── rag_data/ # Raw and processed curriculum PDFs 25 | │ ├── raw/ # Original PDFs 26 | │ └── processed/ # Processed data (if any) 27 | ``` 28 | 29 | ## Setup 30 | 1. **Clone the repository** 31 | 2. **Install dependencies**: 32 | ```sh 33 | pip install -r requirements.txt 34 | ``` 35 | 3. **Start Qdrant database**: 36 | ```sh 37 | docker-compose up -d 38 | ``` 39 | 4. **Configure environment variables**: 40 | - Create a `.env` file in the project root with your OpenAI API key: 41 | ```env 42 | OPENAI_API_KEY=sk-... 43 | QDRANT_HOST=localhost 44 | QDRANT_PORT=6333 45 | QDRANT_COLLECTION_NAME=science_9_collection 46 | QDRANT_VECTOR_SIZE=2560 47 | ``` 48 | 49 | ## Usage 50 | ### 1. Process Curriculum Data 51 | If you have new or improved curriculum data, run: 52 | ```sh 53 | python process_improved_data.py 54 | ``` 55 | This will generate embeddings and store them in Qdrant. 56 | 57 | ### 2. Generate MCQs 58 | Run the MCQ generator for any topic: 59 | ```sh 60 | python lesson_plan_generator.py --topic "Energy" --chapter "10" --section "10.1" 61 | ``` 62 | - The generated MCQs will be saved in `Output_Lesson_Plans/MCQs_energy.json`. 63 | - You can omit `--chapter` and `--section` for broader search. 
64 | 65 | ## Output Format 66 | Each MCQ JSON file contains: 67 | - 5 MCQs with question, options (A-D), correct answer, and explanation 68 | - Metadata: topic, chapter, section, source preview 69 | 70 | ## Customization 71 | - **Embeddings**: You can use your own embedding model by editing `embedding_model.py`. 72 | - **Qdrant Collection**: Change collection name/vector size in `config.py` and `.env`. 73 | - **MCQ Format**: Edit `lesson_plan_generator.py` for custom output formatting. 74 | 75 | ## Troubleshooting 76 | - Ensure Qdrant is running (`docker-compose up -d`). 77 | - Make sure your `.env` file contains a valid OpenAI API key. 78 | - Check `requirements.txt` for missing dependencies. 79 | - For import errors, run scripts from the project root. 80 | 81 | ## License 82 | MIT License 83 | 84 | ## Credits 85 | - [Qdrant Vector Database](https://qdrant.tech/) 86 | - [LangChain](https://langchain.com/) 87 | - [OpenAI GPT](https://platform.openai.com/) 88 | - [Qwen3 Embedding](https://huggingface.co/Qwen/Qwen3-Embedding-4B) 89 | -------------------------------------------------------------------------------- /qdrant_connector.py: -------------------------------------------------------------------------------- 1 | """ 2 | Qdrant vector database connector for EduPlan AI. 3 | This module provides a connector for interacting with the Qdrant vector database.
class QdrantConnector:
    """Connector for interacting with the Qdrant vector database.

    The connector wraps a ``QdrantClient`` bound to a single collection.
    Apart from the constructor, every public method catches client errors,
    logs them and returns a falsy value (``False``, ``[]``, ``{}`` or
    ``None``) instead of raising.
    """

    def __init__(self, host: str = "localhost", port: int = 6333,
                 collection_name: str = "eduplan", vector_size: int = 4096):
        """
        Initialize the Qdrant connector.

        Args:
            host: Qdrant server hostname
            port: Qdrant server port
            collection_name: Name of the collection to use
            vector_size: Dimensionality of the vectors to store

        Raises:
            Exception: Whatever ``QdrantClient`` raises while being
                constructed is logged and re-raised.
        """
        self.host = host
        self.port = port
        self.collection_name = collection_name
        self.vector_size = vector_size

        # Initialize client
        try:
            self.client = QdrantClient(host=host, port=port)
            logger.debug(f"Connected to Qdrant at {host}:{port}")
        except Exception as e:
            logger.error(f"Error connecting to Qdrant: {e}")
            raise

    @staticmethod
    def _normalize_point_id(raw_id: Any) -> Union[int, str]:
        """
        Convert an arbitrary document ID into a valid Qdrant point ID.

        Qdrant only accepts unsigned integers and UUIDs as point IDs.
        Non-negative integers and UUID strings are returned unchanged; any
        other value (arbitrary strings, lists, negative numbers, ...) is
        mapped to a deterministic UUIDv5 derived from ``str(raw_id)``, so the
        same document ID always resolves to the same point.

        Args:
            raw_id: The ID found on the document

        Returns:
            An ``int`` or a UUID string usable as a Qdrant point ID

        Raises:
            ValueError: If ``raw_id`` is ``None``. Hashing ``None`` would make
                every ID-less document overwrite the same point.
        """
        # Imported locally so the helper does not depend on module imports.
        import uuid

        if raw_id is None:
            raise ValueError("Document is missing an 'id'")

        # bool is a subclass of int but is never a meaningful point ID.
        if isinstance(raw_id, int) and not isinstance(raw_id, bool) and raw_id >= 0:
            return raw_id

        if isinstance(raw_id, uuid.UUID):
            return str(raw_id)

        if isinstance(raw_id, str):
            try:
                uuid.UUID(raw_id)
            except ValueError:
                pass  # Not a UUID: fall through to the deterministic hash
            else:
                return raw_id

        return str(uuid.uuid5(uuid.NAMESPACE_OID, str(raw_id)))

    def recreate_collection(self) -> bool:
        """
        Delete collection if it exists and create a new one.

        The new collection uses ``self.vector_size`` dimensions and cosine
        distance.

        Returns:
            True if successful, False otherwise
        """
        try:
            # Check if collection exists
            collections = self.client.get_collections().collections
            collection_names = [collection.name for collection in collections]

            if self.collection_name in collection_names:
                # Delete existing collection
                self.client.delete_collection(collection_name=self.collection_name)
                print(f"🗑️ Deleted existing collection: {self.collection_name}")

            # Create new collection
            self.client.create_collection(
                collection_name=self.collection_name,
                vectors_config=models.VectorParams(
                    size=self.vector_size,
                    distance=models.Distance.COSINE
                )
            )
            print(f"✅ Created new collection: {self.collection_name}")
            return True

        except Exception as e:
            logger.error(f"Error recreating collection: {e}")
            return False

    def get_collection_info(self) -> Dict[str, Any]:
        """
        Get information about the collection.

        Returns:
            Whatever ``client.get_collection`` returns for the collection, or
            an empty dict if the lookup fails
        """
        try:
            return self.client.get_collection(collection_name=self.collection_name)
        except Exception as e:
            logger.error(f"Error getting collection info: {e}")
            return {}

    def insert_documents(self, documents: List[Dict], embeddings: List[List[float]], batch_size: int = 2) -> bool:
        """
        Insert documents with embeddings into Qdrant.

        Document IDs are passed through ``_normalize_point_id`` so that IDs
        which Qdrant would reject (arbitrary strings, lists) are stored under
        a deterministic UUID instead of failing the whole insertion.

        Args:
            documents: List of document dictionaries with 'id', 'text', and 'metadata'
            embeddings: List of embedding vectors (must match documents length)
            batch_size: Number of documents to insert at once (must be >= 1)

        Returns:
            True if insertion was successful, False on any error
        """
        try:
            if len(documents) != len(embeddings):
                logger.error(f"Document count ({len(documents)}) does not match embeddings count ({len(embeddings)})")
                return False

            if batch_size < 1:
                logger.error(f"Invalid batch size: {batch_size}")
                return False

            logger.info(f"Inserting {len(documents)} documents into collection '{self.collection_name}'")

            # Process in batches
            for start in range(0, len(documents), batch_size):
                batch_docs = documents[start:start + batch_size]
                batch_embeddings = embeddings[start:start + batch_size]

                # Build typed points; PointStruct is the documented payload
                # type for upsert and validates each point eagerly.
                points = [
                    models.PointStruct(
                        id=self._normalize_point_id(doc.get("id")),
                        vector=emb,
                        payload={
                            "text": doc.get("text", ""),
                            "metadata": doc.get("metadata", {}),
                        },
                    )
                    for doc, emb in zip(batch_docs, batch_embeddings)
                ]

                # Insert batch
                self.client.upsert(
                    collection_name=self.collection_name,
                    points=points
                )

            logger.info(f"Successfully inserted {len(documents)} documents")
            return True

        except Exception as e:
            logger.error(f"Error inserting documents: {str(e)}")
            return False

    def search_documents(self, query_vector: List[float], limit: int = 5,
                         filter: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
        """
        Search for similar documents in the collection.

        Args:
            query_vector: Query embedding vector
            limit: Maximum number of results to return
            filter: Optional filter to apply to the search (forwarded as
                ``query_filter``; the name shadows the builtin but is kept
                for backward compatibility)

        Returns:
            The client's search results, or an empty list on error
        """
        try:
            return self.client.search(
                collection_name=self.collection_name,
                query_vector=query_vector,
                limit=limit,
                query_filter=filter
            )
        except Exception as e:
            logger.error(f"Error searching documents: {e}")
            return []

    def delete_document(self, document_id: Union[str, int]) -> bool:
        """
        Delete a document from the collection.

        Args:
            document_id: ID of the document to delete, as it was given to
                ``insert_documents`` (it is normalized the same way)

        Returns:
            True if successful, False otherwise
        """
        try:
            self.client.delete(
                collection_name=self.collection_name,
                points_selector=models.PointIdsList(
                    points=[self._normalize_point_id(document_id)]
                )
            )
            return True
        except Exception as e:
            logger.error(f"Error deleting document: {e}")
            return False

    def get_document(self, document_id: Union[str, int]) -> Optional[Dict[str, Any]]:
        """
        Get a document from the collection by ID.

        Args:
            document_id: ID of the document to get, as it was given to
                ``insert_documents`` (it is normalized the same way)

        Returns:
            Document if found, None otherwise
        """
        try:
            results = self.client.retrieve(
                collection_name=self.collection_name,
                ids=[self._normalize_point_id(document_id)]
            )
            return results[0] if results else None
        except Exception as e:
            logger.error(f"Error getting document: {e}")
            return None


# Example usage
if __name__ == "__main__":
    # Test the connector
    connector = QdrantConnector(
        host="localhost",
        port=6333,
        collection_name="test_collection",
        vector_size=4
    )

    # Create a test collection
    connector.recreate_collection()

    # Documents and their vectors are passed separately, index-aligned,
    # which is the shape insert_documents expects.
    test_docs = [
        {"id": 1, "text": "Test document 1", "metadata": {"source": "test"}},
        {"id": 2, "text": "Test document 2", "metadata": {"source": "test"}},
    ]
    test_embeddings = [
        [0.1, 0.2, 0.3, 0.4],
        [0.2, 0.3, 0.4, 0.5],
    ]

    connector.insert_documents(test_docs, test_embeddings)

    # Search for similar documents
    results = connector.search_documents([0.1, 0.2, 0.3, 0.4], limit=1)
    print(f"Search results: {results}")
class NVEmbedPipeline:
    """Pipeline for generating text embeddings with a Hugging Face model.

    The class keeps its historical "NVEmbed" name, but the default
    checkpoint is ``Qwen/Qwen3-Embedding-4B``. Token-level model outputs are
    reduced to one vector per text by attention-masked mean pooling.
    """

    def __init__(self, model_name: str = "Qwen/Qwen3-Embedding-4B", device: str = None):
        """Initialize the pipeline and load the model.

        Args:
            model_name: Hugging Face model identifier to load
            device: Torch device string; when None, "cuda:0" is used if CUDA
                is available, otherwise "cpu"
        """
        self.model_name = model_name

        # Use CUDA if available, otherwise fall back to CPU
        if device is None:
            self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        else:
            self.device = device

        # Load model and tokenizer
        self._load_model()

    def _load_model(self):
        """Load the tokenizer and model, then record the embedding size.

        Sets ``self.tokenizer``, ``self.model``, ``self.embedding_dim`` and
        ``self.vector_size``.

        Raises:
            Exception: Any loading error is logged and re-raised.
        """
        try:
            if self.device.startswith("cuda"):
                # A fraction of 1.0 allows this process to use all of the
                # device memory; lower it to leave headroom for other jobs.
                torch.cuda.set_per_process_memory_fraction(1.0)
                torch.cuda.empty_cache()

            # Print loading message
            print(f"🔄 Loading Qwen/Qwen3-Embedding-4B: {self.model_name}")
            print(f"🎯 Using device: {self.device}")

            # Use half precision for GPU to save memory
            dtype = torch.float16 if self.device.startswith("cuda") else torch.float32

            # NOTE(security): trust_remote_code=True executes Python code
            # shipped with the checkpoint; only load models you trust.
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_name,
                trust_remote_code=True
            )

            self.model = AutoModel.from_pretrained(
                self.model_name,
                trust_remote_code=True,
                torch_dtype=dtype,
                low_cpu_mem_usage=True
            )

            # Move model to device
            self.model.to(self.device)

            # Take the embedding size from the model config so other
            # checkpoints work; 2560 (the value previously hard-coded here)
            # is the fallback when the config does not expose hidden_size.
            config = getattr(self.model, "config", None)
            self.embedding_dim = getattr(config, "hidden_size", None) or 2560
            self.vector_size = self.embedding_dim  # Alias kept for compatibility

            # Print success message
            print(f"✅ Qwen/Qwen3-Embedding-4B loaded successfully!")
            print(f" 📊 Vector size: {self.embedding_dim}")
            print(f" 🎯 Device: {self.device}")
            print(f" 📏 Model dtype: {dtype}")

        except Exception as e:
            logger.error(f"Error loading NV-Embed model: {e}")
            raise

    @staticmethod
    def _masked_mean_pool(token_embeddings, attention_mask, fallback_dim: int) -> np.ndarray:
        """Mean-pool token embeddings over the positions the mask keeps.

        Sequences are processed one at a time to keep peak memory low.

        Args:
            token_embeddings: Tensor indexed by sequence on its first axis
            attention_mask: Mask tensor with one row per sequence; non-zero
                entries mark tokens that contribute to the mean
            fallback_dim: Length of the zero vector emitted for a sequence
                whose mask is entirely zero

        Returns:
            Array with one pooled vector per sequence
        """
        pooled = []
        for seq_idx in range(token_embeddings.shape[0]):
            seq_tokens = token_embeddings[seq_idx]
            seq_mask = attention_mask[seq_idx]

            token_count = torch.sum(seq_mask).item()
            if token_count > 0:
                # Zero out padded positions, then average over real tokens
                summed = torch.sum(seq_tokens * seq_mask.unsqueeze(-1), dim=0)
                pooled.append((summed / token_count).cpu().numpy())
            else:
                # Fallback if no tokens (shouldn't happen)
                pooled.append(torch.zeros(fallback_dim).cpu().numpy())

        return np.array(pooled)

    def _extract_batch_embeddings(self, outputs, attention_mask) -> np.ndarray:
        """Turn raw model outputs into one embedding per input text.

        Lookup order for dict-like outputs: ``sentence_embeddings``, then
        ``last_hidden_state``, then the first tensor with at least two
        dimensions. Token-level (3-D) tensors are mean-pooled.

        Raises:
            ValueError: If a dict-like output holds no usable tensor.
        """
        # Models called with return_dict=False hand back a tuple whose first
        # element is the hidden-state tensor.
        if isinstance(outputs, (tuple, list)):
            outputs = outputs[0]

        if not isinstance(outputs, dict):
            # Direct tensor
            return self._masked_mean_pool(outputs, attention_mask, self.embedding_dim)

        if "sentence_embeddings" in outputs:
            sentence_embeddings = outputs["sentence_embeddings"]
            if len(sentence_embeddings.shape) == 3:
                return self._masked_mean_pool(sentence_embeddings, attention_mask, self.embedding_dim)
            # Already sentence-level embeddings
            return sentence_embeddings.cpu().numpy()

        if "last_hidden_state" in outputs:
            return self._masked_mean_pool(outputs["last_hidden_state"], attention_mask, self.embedding_dim)

        # Try to find any usable tensor
        for key, value in outputs.items():
            if isinstance(value, torch.Tensor) and value.dim() >= 2:
                logger.info(f"Using fallback key: {key}")
                # A token-level tensor must be pooled, otherwise each "row"
                # would be a whole (seq, dim) matrix rather than a vector.
                if value.dim() == 3 and tuple(value.shape[:2]) == tuple(attention_mask.shape):
                    return self._masked_mean_pool(value, attention_mask, self.embedding_dim)
                return value.cpu().numpy()

        raise ValueError("Cannot find usable embeddings in model output")

    def _fit_dimension(self, emb) -> List[float]:
        """Return ``emb`` as a list padded or truncated to ``vector_size``."""
        emb_list = emb.tolist() if isinstance(emb, np.ndarray) else list(emb)

        if len(emb_list) != self.vector_size:
            logger.warning(f"Fixing dimension: {len(emb_list)} → {self.vector_size}")
            if len(emb_list) < self.vector_size:
                # Pad with zeros
                emb_list = emb_list + [0.0] * (self.vector_size - len(emb_list))
            else:
                # Truncate
                emb_list = emb_list[:self.vector_size]

        return emb_list

    def embed_texts(self, texts: List[str], batch_size: int = 1) -> List[List[float]]:
        """Generate embeddings for a list of texts.

        Args:
            texts: Texts to embed; each is truncated to 512 tokens
            batch_size: Number of texts per forward pass (must be >= 1)

        Returns:
            One embedding per text, each a list of ``vector_size`` floats.
            Texts that cannot be embedded because the GPU ran out of memory
            get an all-zero vector.

        Raises:
            ValueError: If ``batch_size`` is less than 1, or the model output
                contains no usable tensor.
            RuntimeError: Any runtime error other than out-of-memory.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        print(f"🔄 Generating embeddings for {len(texts)} texts...")

        embeddings = []

        # Process in batches
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i:i+batch_size]

            # Tokenize batch
            inputs = self.tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors="pt"
            ).to(self.device)

            # Cast float tensors to model dtype
            for key in inputs:
                if torch.is_floating_point(inputs[key]):
                    inputs[key] = inputs[key].to(self.model.dtype)

            try:
                # Generate embeddings with no gradient tracking
                with torch.no_grad():
                    outputs = self.model(**inputs)

                embeddings.extend(
                    self._extract_batch_embeddings(outputs, inputs["attention_mask"])
                )

            except RuntimeError as e:
                if 'out of memory' not in str(e):
                    # Other error
                    raise

                # Out of memory, clean up and try with smaller batch
                torch.cuda.empty_cache()
                logger.warning(f"GPU out of memory, reducing batch size and retrying...")

                if batch_size > 1:
                    # Try with batch_size of 1
                    for text in batch_texts:
                        try:
                            embeddings.append(self.embed_query(text))
                        except Exception as inner_e:
                            logger.error(f"Error processing single text: {inner_e}")
                            # Add zeros as fallback
                            embeddings.append([0.0] * self.embedding_dim)
                else:
                    # Even batch_size=1 failed, add zeros as fallback
                    logger.error(f"Cannot process even with batch_size=1: {e}")
                    for _ in batch_texts:
                        embeddings.append([0.0] * self.embedding_dim)

            # Log progress
            print(f" 📊 Processed {min(i+batch_size, len(texts))}/{len(texts)} texts ({self.device.upper()})")

            # Free GPU memory
            if self.device.startswith("cuda"):
                torch.cuda.empty_cache()

        # Convert to lists for consistent output
        result = [self._fit_dimension(emb) for emb in embeddings]

        print(f"✅ Generated {len(result)} embeddings with dimension {self.vector_size}")
        return result

    def embed_query(self, text: str) -> List[float]:
        """Generate embedding for a single query text.

        Returns:
            The embedding as a list of floats, or an empty list if no
            embedding was produced.
        """
        result = self.embed_texts([text], batch_size=1)
        return result[0] if result else []
def load_improved_data(data_dir: str = "../../data/processed_improved") -> List[Dict[str, Any]]:
    """
    Read every ``Chapter_*.json`` file found in the improved-data directory.

    Args:
        data_dir: Directory holding the JSON files, resolved relative to the
            directory containing this script

    Returns:
        One ``{"file": <name>, "data": <parsed JSON>}`` entry per file that
        loaded successfully, ordered by sorted path. Empty when the
        directory is missing or holds no matching files.
    """
    base_dir = Path(__file__).resolve().parent
    data_path = base_dir / data_dir
    logger.info(f"Loading improved data from {data_path}")

    loaded: List[Dict[str, Any]] = []

    if not data_path.exists():
        logger.error(f"Data directory not found: {data_path}")
        return loaded

    # Files follow the Chapter_X_Y.json naming pattern
    chapter_files = sorted(data_path.glob("Chapter_*.json"))
    if len(chapter_files) == 0:
        logger.warning(f"No Chapter_*.json files found in {data_path}")
        return loaded

    logger.info(f"Found {len(chapter_files)} JSON files to load")

    for json_file in chapter_files:
        # A file that fails to load is logged and skipped so the remaining
        # files are still processed.
        try:
            with open(json_file, 'r', encoding='utf-8') as handle:
                data = json.load(handle)
                item_count = len(data) if isinstance(data, list) else '1'
                logger.info(f"Loaded {json_file.name}: {item_count} items")
            loaded.append({"file": json_file.name, "data": data})
        except Exception as e:
            logger.error(f"Error loading {json_file}: {e}")

    return loaded


def extract_chapter_info(filename: str) -> Tuple[str, str]:
    """
    Split a data filename into its chapter and chunk identifiers.

    Args:
        filename: Filename like "Chapter_1_1.json"

    Returns:
        Tuple of (chapter_number, chunk_number). The chunk defaults to "1"
        when the name has no chunk part, and the chapter to "unknown" when
        the name contains no underscore-separated chapter part.
    """
    try:
        # Chapter_X_Y.json -> ["Chapter", "X", "Y"]
        pieces = filename.replace('.json', '').split('_')

        if len(pieces) < 3:
            # Unexpected format: keep whatever chapter part is present
            chapter = pieces[1] if len(pieces) > 1 else "unknown"
            return chapter, "1"

        _, chapter, chunk, *_ = pieces
        return chapter, chunk
    except Exception as e:
        logger.warning(f"Could not parse filename {filename}: {e}")
        return "unknown", "1"
def _number_questions(questions: List[Any]) -> List[str]:
    """Return the questions as display lines, numbering only where needed.

    A question whose first token already looks like a list number ("1.",
    "12)") is kept as written, so source data that carries its own numbering
    is not rendered as "1. 1. ...". Any other question is prefixed with its
    1-based position.
    """
    lines = []
    for position, question in enumerate(questions, 1):
        text = str(question)
        tokens = text.split(None, 1)
        head = tokens[0] if tokens else ""
        # "2.5 kg ..." is not a list number: the token must be digits
        # followed by exactly one trailing "." or ")".
        already_numbered = len(head) > 1 and head[-1] in ".)" and head[:-1].isdigit()
        lines.append(text if already_numbered else f"{position}. {text}")
    return lines


def extract_content_from_block(content_block: Dict[str, Any]) -> str:
    """
    Extract text content from a content block based on its type.

    Supported ``type`` values are "text", "summary", "activity" and
    "questions"; any other type yields an empty string.

    Args:
        content_block: Dictionary containing content block data

    Returns:
        Extracted text content, with parts joined by newlines
    """
    block_type = content_block.get("type", "unknown")
    content_parts = []

    if block_type == "text":
        content = content_block.get("content", "")
        if content:
            content_parts.append(content)

    elif block_type == "summary":
        title = content_block.get("title", "")
        if title:
            content_parts.append(f"Summary: {title}")

        summary_points = content_block.get("summary_points", [])
        if summary_points:
            content_parts.append("Key Points:")
            content_parts.extend(f"• {point}" for point in summary_points)

    elif block_type == "activity":
        activity_num = content_block.get("activity_number", "")
        title = content_block.get("title", "")
        description = content_block.get("description", "")
        questions = content_block.get("questions", [])

        # Build activity content
        activity_parts = []
        if activity_num:
            activity_parts.append(f"Activity {activity_num}")
        if title:
            activity_parts.append(f"Title: {title}")
        if description:
            activity_parts.append(f"Description: {description}")
        if questions:
            activity_parts.append("Questions:")
            activity_parts.extend(_number_questions(questions))

        if activity_parts:
            content_parts.append("Activity:\n" + "\n".join(activity_parts))

    elif block_type == "questions":
        title = content_block.get("title", "")
        questions = content_block.get("questions", [])

        if title:
            content_parts.append(f"Questions Section: {title}")

        if questions:
            content_parts.append("Questions:")
            content_parts.extend(_number_questions(questions))

    return "\n".join(content_parts)
def _iter_substantial_blocks(content_blocks):
    """Yield (index, block, stripped_text) for dict blocks with more than 10 characters of text."""
    if not isinstance(content_blocks, (list, tuple)):
        # A missing or null "content_blocks" entry simply contributes nothing.
        return
    for block_idx, content_block in enumerate(content_blocks):
        if not isinstance(content_block, dict):
            continue
        text_content = (extract_content_from_block(content_block) or "").strip()
        # Only keep blocks with substantial content
        if len(text_content) > 10:
            yield block_idx, content_block, text_content


def prepare_documents(improved_data: List[Dict[str, Any]]) -> Tuple[List[str], List[Dict[str, Any]]]:
    """
    Extract text and metadata from improved data with the new chunked format.

    Each entry's "data" is expected to be a non-empty list of chapter
    dicts holding "sections"; each section holds "content_blocks" and
    optional "sub_sections" with their own "content_blocks". One document
    is produced per content block whose extracted text is longer than 10
    characters. Entries that are not dicts are skipped at every level.

    Args:
        improved_data: List of {"file": <filename>, "data": <parsed JSON>} dicts

    Returns:
        Tuple containing (text_chunks, metadata), index-aligned
    """
    texts = []
    metadata = []

    for file_data in improved_data:
        filename = file_data["file"]
        data = file_data["data"]

        # Extract chapter and chunk info from new filename format
        chapter_num, chunk_num = extract_chapter_info(filename)

        logger.info(f"Processing {filename} - Chapter {chapter_num}, Chunk {chunk_num}")

        # Handle the new format: data is a list with chapter objects
        if not (isinstance(data, list) and data):
            logger.warning(f"Unexpected data format for file: {filename}")
            continue

        # NOTE: ids below are built from the filename-derived chapter/chunk
        # plus section/block indices only; they do not include the position
        # of the chapter object within `data`.
        for chapter_data in data:
            if not isinstance(chapter_data, dict):
                continue

            chapter_number = chapter_data.get("chapter_number", chapter_num)
            chapter_title = chapter_data.get("chapter_title", f"Chapter {chapter_num}")

            # Process all sections in this chapter
            for section_idx, section in enumerate(chapter_data.get("sections") or []):
                if not isinstance(section, dict):
                    continue

                section_number = section.get("section_number", "")
                section_title = section.get("section_title", "")
                section_label = section_number or f"Section {section_idx + 1}"

                # Content blocks directly under the section
                for block_idx, content_block, text_content in _iter_substantial_blocks(
                    section.get("content_blocks")
                ):
                    texts.append(text_content)
                    metadata.append({
                        "id": f"ch{chapter_num}_chunk{chunk_num}_s{section_idx}_b{block_idx}",
                        "chapter_number": chapter_number,
                        "chapter_title": chapter_title,
                        "section_number": section_label,
                        "section_title": section_title,
                        "content_type": content_block.get("type", "unknown"),
                        "source_file": filename,
                        "chunk_number": chunk_num,
                        "original_chapter": f"Chapter {chapter_num}",
                        "block_index": block_idx,
                    })

                # Content blocks nested in sub-sections
                for sub_idx, sub_section in enumerate(section.get("sub_sections") or []):
                    if not isinstance(sub_section, dict):
                        continue

                    sub_section_number = sub_section.get("section_number", "")
                    sub_section_title = sub_section.get("section_title", "")

                    for sub_block_idx, content_block, text_content in _iter_substantial_blocks(
                        sub_section.get("content_blocks")
                    ):
                        texts.append(text_content)
                        metadata.append({
                            "id": f"ch{chapter_num}_chunk{chunk_num}_s{section_idx}_sub{sub_idx}_b{sub_block_idx}",
                            "chapter_number": chapter_number,
                            "chapter_title": chapter_title,
                            "section_number": section_label,
                            "section_title": section_title,
                            "sub_section_number": sub_section_number,
                            "sub_section_title": sub_section_title,
                            "content_type": content_block.get("type", "unknown"),
                            "source_file": filename,
                            "chunk_number": chunk_num,
                            "original_chapter": f"Chapter {chapter_num}",
                            "block_index": sub_block_idx,
                            "is_sub_section": True,
                        })

    logger.info(f"Prepared {len(texts)} documents with metadata")
    return texts, metadata


def generate_embeddings(texts: List[str]) -> List[List[float]]:
    """
    Generate embeddings for text chunks using the NVEmbedPipeline wrapper.

    Args:
        texts: List of text chunks to embed

    Returns:
        List of embedding vectors as plain Python lists; an empty list when
        there is nothing to embed or the pipeline returns None
    """
    if not texts:
        logger.warning("No texts supplied; skipping embedding generation")
        return []

    logger.info("Initializing NV-Embed model...")
    embedding_model = NVEmbedPipeline()

    logger.info(f"Generating embeddings for {len(texts)} documents...")
    start_time = time.time()
    embeddings = embedding_model.embed_texts(texts)
    elapsed = time.time() - start_time

    if embeddings is None:
        logger.error("Embedding model returned no embeddings")
        return []

    logger.info(f"Generated {len(embeddings)} embeddings in {elapsed:.2f} seconds")

    # Use explicit length checks: truth-testing an array-like container
    # holding more than one element raises instead of answering.
    if len(embeddings) > 0:
        first_emb = embeddings[0]
        logger.info(f"First embedding type: {type(first_emb)}")
        logger.info(f"First embedding length: {len(first_emb)}")
        if callable(getattr(first_emb, "tolist", None)):
            logger.info("Converting embeddings from numpy/tensor to list format")

    # Convert each vector individually so a mixed container is handled too.
    return [
        emb.tolist() if callable(getattr(emb, "tolist", None)) else emb
        for emb in embeddings
    ]
def store_in_database(texts, embeddings, metadata, collection_name="science_9_collection",
                      vector_size=2560):
    """
    Store documents and embeddings in Qdrant.

    Inputs are validated before the collection is touched: the three
    sequences must be the same length and every embedding must have
    `vector_size` dimensions. On a validation failure the function logs
    the problem and returns False without calling the connector.

    Args:
        texts: List of text chunks
        embeddings: List of embedding vectors
        metadata: List of metadata dictionaries (not modified)
        collection_name: Name of the Qdrant collection
        vector_size: Dimensionality the collection is created with
            (defaults to 2560, the size previously hard-coded for the
            Qwen 4B embedding model)

    Returns:
        The success flag reported by the connector's insert_documents,
        or False when validation fails
    """
    logger.info(f"Storing data in collection: {collection_name}")
    logger.info(f"Using vector size: {vector_size}")

    # zip() would silently drop the surplus of the longer inputs.
    if not (len(texts) == len(embeddings) == len(metadata)):
        logger.error(
            f"Input length mismatch: {len(texts)} texts, "
            f"{len(embeddings)} embeddings, {len(metadata)} metadata entries"
        )
        return False

    unexpected_dims = sorted({len(emb) for emb in embeddings} - {vector_size})
    if unexpected_dims:
        logger.error(
            f"Embedding dimensions {unexpected_dims} do not match "
            f"collection vector size {vector_size}"
        )
        return False

    # Create Qdrant connector
    qdrant = QdrantConnector(
        host=QDRANT_HOST,
        port=QDRANT_PORT,
        collection_name=collection_name,
        vector_size=vector_size
    )

    # NOTE(review): recreate_collection() is called unconditionally; judging
    # by its name it presumably drops existing data - confirm in QdrantConnector.
    qdrant.recreate_collection()

    # Prepare documents for insertion
    documents = []
    for i, (text, meta) in enumerate(zip(texts, metadata)):
        # Work on a copy so the caller's metadata dicts are left untouched.
        payload_meta = dict(meta)
        # Numeric ids are used for Qdrant; the descriptive id is kept in the metadata.
        payload_meta["original_id"] = payload_meta.get("id", f"doc_{i}")

        documents.append({
            "id": i,  # Use simple numeric ID for Qdrant
            "text": text,
            "metadata": payload_meta
        })

    # Insert documents
    success = qdrant.insert_documents(documents, embeddings)

    if success:
        logger.info(f"Successfully stored {len(documents)} documents in database")
    else:
        logger.error("Failed to store documents in database")

    return success


def main():
    """Run the pipeline: load data, prepare documents, embed them, store them."""
    logger.info("Starting improved data processing with NV-Embed")

    # Load improved data
    improved_data = load_improved_data()
    if not improved_data:
        logger.error("No improved data found. Exiting.")
        return

    # Prepare documents
    texts, metadata = prepare_documents(improved_data)
    if not texts:
        logger.error("No text chunks extracted. Exiting.")
        return

    # Generate embeddings
    embeddings = generate_embeddings(texts)
    # `texts` is non-empty here, so a missing or empty result also fails
    # this comparison; len() is never evaluated on None.
    embedding_count = len(embeddings) if embeddings is not None else 0
    if embedding_count != len(texts):
        logger.error(
            f"Embedding generation failed. Got {embedding_count} embeddings for {len(texts)} texts."
        )
        return

    # Store in database
    success = store_in_database(texts, embeddings, metadata)

    if success:
        logger.info("✅ Processing completed successfully!")
        logger.info(f" Processed {len(texts)} documents across {len(improved_data)} files")
        logger.info(f" Files processed: {[data['file'] for data in improved_data]}")
    else:
        logger.error("❌ Processing failed.")
Exiting.") 372 | return 373 | 374 | # Generate embeddings 375 | embeddings = generate_embeddings(texts) 376 | if not embeddings or len(embeddings) != len(texts): 377 | logger.error(f"Embedding generation failed. Got {len(embeddings)} embeddings for {len(texts)} texts.") 378 | return 379 | 380 | # Store in database 381 | success = store_in_database(texts, embeddings, metadata) 382 | 383 | if success: 384 | logger.info(f"✅ Processing completed successfully!") 385 | logger.info(f" Processed {len(texts)} documents across {len(improved_data)} files") 386 | logger.info(f" Files processed: {[data['file'] for data in improved_data]}") 387 | else: 388 | logger.error("❌ Processing failed.") 389 | 390 | if __name__ == "__main__": 391 | main() -------------------------------------------------------------------------------- /lesson_plan_generator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Simplified MCQ Generator for EduPlan AI 4 | Generates exactly 5 MCQs for any given topic and saves them in Output_Lesson_Plans 5 | """ 6 | 7 | import os 8 | import logging 9 | import json 10 | from typing import List, Dict, Any, Optional 11 | from datetime import datetime 12 | from pathlib import Path 13 | from langchain_qdrant import Qdrant 14 | from langchain_core.prompts import PromptTemplate 15 | from langchain_core.output_parsers import PydanticOutputParser 16 | from pydantic import BaseModel, Field 17 | from langchain_openai import ChatOpenAI 18 | from langchain_community.embeddings import HuggingFaceEmbeddings 19 | from qdrant_client import QdrantClient, models 20 | 21 | # Local imports - adjust these paths according to your project structure 22 | from qdrant_connector import QdrantConnector 23 | from config import QDRANT_HOST, QDRANT_PORT, QDRANT_COLLECTION_NAME, QDRANT_VECTOR_SIZE 24 | 25 | logging.basicConfig(level=logging.INFO) 26 | logger = logging.getLogger(__name__) 27 | 28 | class 
class MCQQuestion(BaseModel):
    """One multiple-choice question with four options, its answer and an explanation."""
    question: str
    option_a: str
    option_b: str
    option_c: str
    option_d: str
    correct_answer: str
    explanation: str


class MCQSet(BaseModel):
    """The set of questions parsed from a single LLM response."""
    mcqs: List[MCQQuestion]


class _RetrievedDoc:
    """Lightweight record exposing page_content, metadata and score for a retrieved point."""

    def __init__(self, page_content, metadata, score=None):
        self.page_content = page_content
        self.metadata = metadata
        self.score = score


class MCQGenerator:
    """Generate MCQs for a topic from content retrieved out of Qdrant, using an OpenAI chat model."""

    def __init__(self, openai_api_key: Optional[str] = None):
        """
        Set up the Qdrant connector, query embeddings, LLM and output parser.

        Args:
            openai_api_key: API key; falls back to the OPENAI_API_KEY
                environment variable

        Raises:
            ValueError: If no API key is available
        """
        self.openai_api_key = openai_api_key or os.getenv("OPENAI_API_KEY")
        if not self.openai_api_key:
            raise ValueError("❌ OpenAI API key not found. Set OPENAI_API_KEY environment variable.")

        self.qdrant_connector = QdrantConnector(
            host=QDRANT_HOST,
            port=QDRANT_PORT,
            collection_name=QDRANT_COLLECTION_NAME,
            vector_size=QDRANT_VECTOR_SIZE
        )
        self.embeddings = HuggingFaceEmbeddings(
            model_name="Qwen/Qwen3-Embedding-4B",
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True}
        )
        self.vector_store = self._initialize_vector_store()
        self.llm = ChatOpenAI(
            model="gpt-4o",
            temperature=0.3,
            openai_api_key=self.openai_api_key
        )
        self.output_parser = PydanticOutputParser(pydantic_object=MCQSet)
        logger.info("✅ MCQ Generator initialized successfully")

    def _initialize_vector_store(self):
        """Wrap the connector's client in a LangChain Qdrant store; return None on failure."""
        try:
            vector_store = Qdrant(
                client=self.qdrant_connector.client,
                collection_name=QDRANT_COLLECTION_NAME,
                embeddings=self.embeddings
            )
            logger.info("✅ Connected to Qdrant vector store")
            return vector_store
        except Exception as e:
            logger.error(f"❌ Failed to initialize vector store: {e}")
            return None

    def retrieve_relevant_content(self, topic: str, chapter: str = None, section: str = None,
                                  content_type: str = None, top_k: int = 8) -> List[Any]:
        """
        Retrieve the points most similar to `topic` from the Qdrant collection.

        Args:
            topic: Free-text topic, embedded and used as the query vector
            chapter: Optional exact-match filter on payload key "chapter_number"
            section: Optional exact-match filter on payload key "section_number"
            content_type: Optional exact-match filter on payload key "content_type"
            top_k: Maximum number of points to return

        Returns:
            List of objects with page_content, metadata and score
            attributes; an empty list if the query fails
        """
        search_query = topic
        logger.info(f"🔍 Searching for topic: '{search_query}'")
        query_embedding = self.embeddings.embed_query(search_query)

        # Chapter and section values are compared as strings.
        filter_values = [
            ("chapter_number", str(chapter) if chapter else None),
            ("section_number", str(section) if section else None),
            ("content_type", content_type if content_type else None),
        ]
        must_conditions = [
            models.FieldCondition(key=key, match=models.MatchValue(value=value))
            for key, value in filter_values
            if value is not None
        ]
        query_filter = models.Filter(must=must_conditions) if must_conditions else None

        try:
            if query_filter:
                logger.info(f"🎯 Applying filters: {must_conditions}")
            else:
                logger.info("🌐 Searching all documents without filters")

            # Reuse the connector's client (the same one handed to the
            # LangChain store) instead of opening a new connection per query.
            results = self.qdrant_connector.client.query_points(
                collection_name=QDRANT_COLLECTION_NAME,
                query=query_embedding,
                query_filter=query_filter,
                limit=top_k,
                search_params=models.SearchParams(hnsw_ef=128, exact=False)
            )
            logger.info(f"✅ Found {len(results.points)} relevant documents via Query API")

            docs = []
            for point in results.points:
                # A point without a payload must not discard the other results.
                payload = point.payload or {}
                page_content = (payload.get("text", "") or
                                payload.get("content", "") or
                                payload.get("chunk_text", "") or
                                str(payload))
                docs.append(_RetrievedDoc(
                    page_content=page_content,
                    metadata=payload.copy(),
                    score=getattr(point, 'score', None),
                ))

            if docs:
                logger.info(f"📄 Sample doc metadata: {docs[0].metadata}")
                logger.info(f"📝 Sample content length: {len(docs[0].page_content)} chars")
            return docs
        except Exception as e:
            logger.error(f"❌ Error retrieving content via Query API: {e}")
            logger.error(f"🔍 Query details - Collection: {QDRANT_COLLECTION_NAME}, Filter: {query_filter}")
            return []

    def generate_mcqs(self, topic: str, chapter: str = None, section: str = None,
                      content_type: str = None) -> Dict[str, Any]:
        """
        Generate MCQs for `topic` from retrieved curriculum content.

        Retrieval is first attempted with the given filters and retried
        without filters when nothing is found.

        Returns:
            On success, a dict with "mcqs", "topic", "chapter", "section",
            "generated_at", "source_count" and "sources_preview". On
            failure, a dict with an "error" key describing the problem.
        """
        relevant_docs = self.retrieve_relevant_content(topic, chapter, section, content_type)
        if not relevant_docs:
            logger.warning("⚠️ No documents found. Trying without filters...")
            relevant_docs = self.retrieve_relevant_content(topic)
        if not relevant_docs:
            return {
                "error": "No relevant content found in the knowledge base",
                "topic": topic,
                "suggestions": [
                    "Check if Qdrant contains data",
                    "Try a different topic",
                    "Check if embedding model matches the one used for indexing"
                ]
            }

        context = self._extract_context_from_docs(relevant_docs)
        try:
            mcq_set = self._generate_mcqs_with_openai(topic, context)
            return {
                "mcqs": [
                    {
                        "question": mcq.question,
                        "options": {
                            "A": mcq.option_a,
                            "B": mcq.option_b,
                            "C": mcq.option_c,
                            "D": mcq.option_d
                        },
                        "correct_answer": mcq.correct_answer,
                        "explanation": mcq.explanation
                    }
                    for mcq in mcq_set.mcqs
                ],
                "topic": topic,
                "chapter": chapter,
                "section": section,
                "generated_at": datetime.now().isoformat(),
                "source_count": len(relevant_docs),
                "sources_preview": [
                    {
                        "chapter": doc.metadata.get('chapter_title', 'Unknown'),
                        "section": doc.metadata.get('section_title', 'Unknown'),
                        "content_type": doc.metadata.get('content_type', 'Unknown'),
                        "preview": (doc.page_content[:100] + "..."
                                    if len(doc.page_content) > 100 else doc.page_content)
                    }
                    for doc in relevant_docs[:3]
                ]
            }
        except Exception as e:
            logger.error(f"❌ Error generating MCQs: {e}")
            return {
                "error": f"Failed to generate MCQs: {str(e)}",
                "topic": topic,
                "context_found": len(relevant_docs) > 0,
                "context_preview": relevant_docs[0].page_content[:200] if relevant_docs else "No context"
            }

    def _extract_context_from_docs(self, docs: List[Any]) -> str:
        """Format retrieved documents into one numbered context string for the prompt."""
        context_parts = []
        for i, doc in enumerate(docs):
            content = doc.page_content.strip()
            metadata = doc.metadata
            context_part = f"""
Document {i+1}:
Chapter: {metadata.get('chapter_title', 'Unknown')}
Section: {metadata.get('section_title', 'Unknown')}
Content: {content}
"""
            context_parts.append(context_part)
        return "\n".join(context_parts)

    def _generate_mcqs_with_openai(self, topic: str, context: str) -> MCQSet:
        """Run the prompt -> LLM -> parser chain and return the parsed MCQSet."""
        prompt_template = """
You are an expert educator and assessment designer. Create exactly 5 multiple choice questions (MCQs) based on the provided curriculum content.

CURRICULUM CONTENT:
{context}

REQUIREMENTS:
- Topic: {topic}
- Generate exactly 5 MCQs based on the provided content
- Each question should have 4 options (A, B, C, D)
- Questions should test different levels of understanding (knowledge, comprehension, application, analysis)
- Questions should be clear, unambiguous, and educational
- Provide brief explanations for correct answers
- Ensure questions are directly based on the provided curriculum content
- Avoid trick questions or overly complex language

QUESTION DIFFICULTY LEVELS:
1. Basic knowledge/recall
2. Comprehension/understanding
3. Application of concepts
4. Analysis/evaluation
5. Synthesis/problem-solving

{format_instructions}
"""
        prompt = PromptTemplate(
            template=prompt_template,
            input_variables=["context", "topic"],
            partial_variables={"format_instructions": self.output_parser.get_format_instructions()}
        )
        chain = prompt | self.llm | self.output_parser
        result = chain.invoke({
            "context": context,
            "topic": topic
        })
        return result

    def save_mcqs(self, mcqs_data: Dict[str, Any], output_dir: str = "Output_Lesson_Plans"):
        """
        Write `mcqs_data` as JSON to <output_dir>/MCQs_<topic>.json.

        The output directory is created if missing, and characters that
        cannot appear in a filename are replaced by underscores.

        Args:
            mcqs_data: Result dict; its "topic" entry names the file
            output_dir: Directory the file is written to

        Returns:
            Path of the written file
        """
        topic = str(mcqs_data.get("topic") or "untitled")
        safe_topic = "_".join(topic.lower().split())
        # Neutralise path separators and characters rejected by common
        # filesystems so the topic cannot redirect the write elsewhere.
        for forbidden in '/\\:*?"<>|':
            safe_topic = safe_topic.replace(forbidden, "_")

        out_dir = Path(output_dir)
        out_dir.mkdir(parents=True, exist_ok=True)
        out_path = out_dir / f"MCQs_{safe_topic}.json"
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(mcqs_data, f, indent=2, ensure_ascii=False)
        logger.info(f"Saved MCQs to {out_path}")
        return out_path

    def format_mcqs_for_display(self, mcqs_data: Dict[str, Any]) -> str:
        """Render a generate_mcqs() result, success or error, as Markdown-style text."""
        if "error" in mcqs_data:
            error_output = f"❌ Error: {mcqs_data['error']}\n"
            if "context_preview" in mcqs_data:
                error_output += f"📄 Context found: {mcqs_data.get('context_found', False)}\n"
                error_output += f"🔍 Preview: {mcqs_data['context_preview']}\n"
            return error_output

        pieces = [f"\n# MCQs: {mcqs_data['topic']}\n"]
        if mcqs_data.get('chapter'):
            pieces.append(f"**Chapter:** {mcqs_data['chapter']} | ")
        if mcqs_data.get('section'):
            pieces.append(f"**Section:** {mcqs_data['section']} | ")
        pieces.append(
            f"**Generated:** {mcqs_data['generated_at'][:19]} | "
            f"**Sources:** {mcqs_data['source_count']} documents\n\n"
            "## 📚 Source Materials Used:\n"
        )
        for i, source in enumerate(mcqs_data.get('sources_preview', []), 1):
            pieces.append(f"{i}. **{source['chapter']}** - {source['section']} ({source['content_type']})\n")
            pieces.append(f"   Preview: {source['preview']}\n\n")
        pieces.append("---\n")
        for i, mcq in enumerate(mcqs_data['mcqs'], 1):
            pieces.append(f"""
**Question {i}:** {mcq['question']}

A) {mcq['options']['A']}
B) {mcq['options']['B']}
C) {mcq['options']['C']}
D) {mcq['options']['D']}

**Answer:** {mcq['correct_answer']}
**Explanation:** {mcq['explanation']}

---
""")
        return "".join(pieces).strip()


def main():
    """CLI entry point: load .env, parse arguments, generate MCQs, save and print them."""
    try:
        from dotenv import load_dotenv
        dotenv_path = Path(__file__).parent / '.env'
        if dotenv_path.exists():
            load_dotenv(dotenv_path)
            print(f"🔑 Loaded environment variables from {dotenv_path}")
        else:
            load_dotenv()
            print("🔑 Loaded environment variables from default .env location")
    except ImportError:
        print("⚠️ python-dotenv not installed. Environment variables may not be loaded from .env.")

    import argparse
    parser = argparse.ArgumentParser(description="Generate 5 MCQs for a topic using OpenAI and save to Output_Lesson_Plans.")
    parser.add_argument("--topic", type=str, required=True, help="Topic for MCQ generation")
    parser.add_argument("--chapter", type=str, help="Chapter number (optional)")
    parser.add_argument("--section", type=str, help="Section number (optional)")
    parser.add_argument("--content_type", type=str, help="Content type filter (optional)")
    args = parser.parse_args()

    generator = MCQGenerator()
    result = generator.generate_mcqs(
        topic=args.topic,
        chapter=args.chapter,
        section=args.section,
        content_type=args.content_type
    )
    # Only persist real MCQ sets; an error result would otherwise overwrite
    # a previously generated file for the same topic.
    if "error" in result:
        logger.warning("MCQ generation failed; nothing was saved")
    else:
        generator.save_mcqs(result)
    print(generator.format_mcqs_for_display(result))


if __name__ == "__main__":
    main()
# #!/usr/bin/env python3
# """
# LangChain-based Lesson Plan Generator for
EduPlan AI 319 | # Uses RAG (Retrieval-Augmented Generation) with Qdrant vector database 320 | # """ 321 | 322 | # import os 323 | # import logging 324 | # from typing import List, Dict, Any, Optional 325 | # from datetime import datetime 326 | # from langchain_qdrant import Qdrant 327 | # # LangChain imports 328 | # from langchain_core.prompts import PromptTemplate 329 | # from langchain_core.output_parsers import PydanticOutputParser 330 | # from pydantic import BaseModel, Field 331 | # from langchain_openai import ChatOpenAI 332 | # from langchain_community.embeddings import HuggingFaceEmbeddings 333 | # from langchain.chains import RetrievalQA 334 | # from langchain.chains.question_answering import load_qa_chain 335 | 336 | # # Local imports 337 | # from ..database.qdrant_connector import QdrantConnector 338 | # from ..config import QDRANT_HOST, QDRANT_PORT, QDRANT_COLLECTION_NAME, QDRANT_VECTOR_SIZE 339 | 340 | # # Configure logging 341 | # logging.basicConfig(level=logging.INFO) 342 | # logger = logging.getLogger(__name__) 343 | 344 | 345 | # class LessonPlanStructure(BaseModel): 346 | # """Structured lesson plan output""" 347 | # title: str = Field(description="Lesson title") 348 | # subject: str = Field(description="Subject area") 349 | # grade_level: str = Field(description="Target grade level") 350 | # duration: str = Field(description="Lesson duration") 351 | # objectives: List[str] = Field(description="Learning objectives") 352 | # materials: List[str] = Field(description="Required materials") 353 | # introduction: str = Field(description="Introduction activity") 354 | # main_activities: List[str] = Field(description="Main teaching activities") 355 | # assessment: str = Field(description="Assessment strategy") 356 | # differentiation: str = Field(description="Differentiation strategies") 357 | # standards: List[str] = Field(description="Curriculum standards alignment") 358 | 359 | 360 | # class LangChainLessonGenerator: 361 | # """LangChain-powered 
lesson plan generator using Qdrant embeddings""" 362 | 363 | # def __init__(self, openai_api_key: Optional[str] = None): 364 | # """Initialize the lesson generator with LangChain components""" 365 | 366 | # # Initialize OpenAI API key 367 | # self.openai_api_key = openai_api_key or os.getenv("OPENAI_API_KEY") 368 | # if not self.openai_api_key: 369 | # logger.warning("⚠️ OpenAI API key not found. Set OPENAI_API_KEY environment variable.") 370 | 371 | # # Initialize Qdrant connection 372 | # self.qdrant_connector = QdrantConnector( 373 | # host=QDRANT_HOST, 374 | # port=QDRANT_PORT, 375 | # collection_name=QDRANT_COLLECTION_NAME, 376 | # vector_size=QDRANT_VECTOR_SIZE 377 | # ) 378 | 379 | 380 | # # Initialize embeddings (using same model as your embeddings) 381 | # self.embeddings = HuggingFaceEmbeddings( 382 | # model_name="Qwen/Qwen3-Embedding-4B", 383 | # model_kwargs={'device': 'cpu'}, # Use CPU for retrieval 384 | # encode_kwargs={'normalize_embeddings': True} 385 | # ) 386 | 387 | # # Initialize LangChain Qdrant vector store 388 | # self.vector_store = self._initialize_vector_store() 389 | 390 | 391 | # # Initialize LLM 392 | # self.llm = ChatOpenAI( 393 | # model="gpt-4o-mini", # Cost-effective model 394 | # temperature=0.3, 395 | # openai_api_key=self.openai_api_key 396 | # ) if self.openai_api_key else None 397 | 398 | # # Initialize output parser 399 | # self.output_parser = PydanticOutputParser(pydantic_object=LessonPlanStructure) 400 | 401 | # logger.info("✅ LangChain Lesson Generator initialized") 402 | 403 | # def _initialize_vector_store(self): 404 | # """Initialize LangChain Qdrant vector store""" 405 | # try: 406 | # vector_store = Qdrant( 407 | # client=self.qdrant_connector.client, 408 | # collection_name=QDRANT_COLLECTION_NAME, 409 | # embeddings=self.embeddings 410 | # ) 411 | # logger.info("✅ Connected to Qdrant vector store") 412 | # return vector_store 413 | # except Exception as e: 414 | # logger.error(f"❌ Failed to initialize vector store: 
{e}") 415 | # return None 416 | 417 | # def retrieve_relevant_content(self, topic: str, grade_level: str = None, subject: str = None, top_k: int = 8) -> List[Any]: 418 | # """Retrieve relevant educational content from Qdrant""" 419 | 420 | # # Build search query 421 | # search_query = f"{topic}" 422 | # if grade_level: 423 | # search_query += f" grade {grade_level}" 424 | # if subject: 425 | # search_query += f" {subject}" 426 | 427 | # logger.info(f"🔍 Searching for: '{search_query}'") 428 | 429 | # try: 430 | # # Perform similarity search 431 | # docs = self.vector_store.similarity_search( 432 | # query=search_query, 433 | # k=top_k, 434 | # filter=None 435 | # ) 436 | 437 | # logger.info(f"✅ Found {len(docs)} relevant documents") 438 | # return docs 439 | 440 | # except Exception as e: 441 | # logger.error(f"❌ Error retrieving content: {e}") 442 | # return [] 443 | # def retrieve_relevant_content(self, topic: str, grade_level: str = None, subject: str = None, chapter: str = None, section: str = None, top_k: int = 8) -> List[Any]: 444 | # """Retrieve relevant educational content from Qdrant using Query API""" 445 | 446 | # from qdrant_client import QdrantClient, models 447 | # from langchain_community.embeddings import HuggingFaceEmbeddings 448 | 449 | # # Build search query 450 | # search_query = f"{topic}" 451 | # if grade_level: 452 | # search_query += f" grade {grade_level}" 453 | # if subject: 454 | # search_query += f" {subject}" 455 | # if chapter: 456 | # search_query += f" chapter {chapter}" 457 | # if section: 458 | # search_query += f" section {section}" 459 | 460 | # logger.info(f"🔍 Searching for: '{search_query}'") 461 | 462 | # # Get embedding for query 463 | # query_embedding = self.embeddings.embed_query(search_query) 464 | 465 | # # Build advanced filter 466 | # must_conditions = [] 467 | # if grade_level: 468 | # must_conditions.append(models.FieldCondition(key="grade_level", match=models.MatchValue(value=grade_level))) 469 | # if subject: 470 | 
# must_conditions.append(models.FieldCondition(key="subject", match=models.MatchValue(value=subject))) 471 | # if chapter: 472 | # must_conditions.append(models.FieldCondition(key="chapter_number", match=models.MatchValue(value=chapter))) 473 | # if section: 474 | # must_conditions.append(models.FieldCondition(key="section_number", match=models.MatchValue(value=section))) 475 | 476 | # query_filter = models.Filter(must=must_conditions) if must_conditions else None 477 | 478 | # # Use QdrantClient Query API 479 | # try: 480 | # client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT) 481 | # results = client.query_points( 482 | # collection_name=QDRANT_COLLECTION_NAME, 483 | # query=query_embedding, 484 | # query_filter=query_filter, 485 | # limit=top_k, 486 | # search_params=models.SearchParams(hnsw_ef=128, exact=False) 487 | # ) 488 | # logger.info(f"✅ Found {len(results)} relevant documents via Query API") 489 | # # Convert results to LangChain Document-like objects 490 | # docs = [] 491 | # for point in results: 492 | # page_content = point.payload.get("text", "") 493 | # metadata = point.payload.copy() 494 | # doc = type('Doc', (), {})() 495 | # doc.page_content = page_content 496 | # doc.metadata = metadata 497 | # doc.score = getattr(point, 'score', None) 498 | # docs.append(doc) 499 | # return docs 500 | # except Exception as e: 501 | # logger.error(f"❌ Error retrieving content via Query API: {e}") 502 | # return [] 503 | 504 | # def _build_filter(self, grade_level: str = None, subject: str = None) -> Optional[Dict[str, Any]]: 505 | # """Build Qdrant filter for search""" 506 | # filter_conditions = [] 507 | 508 | # if grade_level: 509 | # filter_conditions.append({ 510 | # "key": "metadata.grade_level", 511 | # "match": {"value": grade_level} 512 | # }) 513 | 514 | # if subject: 515 | # filter_conditions.append({ 516 | # "key": "metadata.subject", 517 | # "match": {"value": subject} 518 | # }) 519 | 520 | # if filter_conditions: 521 | # return {"must": 
filter_conditions} 522 | # return None 523 | # # _build_filter is now handled by Query API above 524 | 525 | # def generate_lesson_plan(self, topic: str, grade_level: str = "9-12", subject: str = "General", 526 | # duration: str = "45 minutes", custom_requirements: str = "", chapter: str = None, section: str = None) -> Dict[str, Any]: 527 | # """ 528 | # Generate a comprehensive lesson plan using RAG 529 | 530 | # Args: 531 | # topic: Main lesson topic 532 | # grade_level: Target grade level 533 | # subject: Subject area 534 | # duration: Lesson duration 535 | # custom_requirements: Additional requirements or constraints 536 | # chapter: Optional chapter number for filtering 537 | # section: Optional section number for filtering 538 | # """ 539 | # # Retrieve relevant content 540 | # relevant_docs = self.retrieve_relevant_content(topic, grade_level, subject, chapter, section) 541 | 542 | # if not relevant_docs: 543 | # return { 544 | # "error": "No relevant content found in the knowledge base", 545 | # "topic": topic, 546 | # "suggestions": ["Try a different topic", "Check grade level", "Verify subject area"] 547 | # } 548 | 549 | # # Extract context from retrieved documents 550 | # context = self._extract_context_from_docs(relevant_docs) 551 | 552 | # # Generate lesson plan using LangChain 553 | # if self.llm: 554 | # lesson_plan = self._generate_with_langchain(topic, context, grade_level, subject, duration, custom_requirements) 555 | # else: 556 | # lesson_plan = self._generate_fallback_lesson_plan(topic, context, grade_level, subject, duration) 557 | 558 | # return { 559 | # "lesson_plan": lesson_plan, 560 | # "sources": [ 561 | # { 562 | # "content": doc.page_content[:200] + "...", 563 | # "metadata": doc.metadata, 564 | # "score": getattr(doc, 'score', None) 565 | # } for doc in relevant_docs 566 | # ], 567 | # "topic": topic, 568 | # "grade_level": grade_level, 569 | # "subject": subject, 570 | # "generated_at": datetime.now().isoformat(), 571 | # 
"source_count": len(relevant_docs) 572 | # } 573 | 574 | # if not relevant_docs: 575 | # return { 576 | # "error": "No relevant content found in the knowledge base", 577 | # "topic": topic, 578 | # "suggestions": ["Try a different topic", "Check grade level", "Verify subject area"] 579 | # } 580 | 581 | # # Extract context from retrieved documents 582 | # context = self._extract_context_from_docs(relevant_docs) 583 | 584 | # # Generate lesson plan using LangChain 585 | # if self.llm: 586 | # lesson_plan = self._generate_with_langchain(topic, context, grade_level, subject, duration, custom_requirements) 587 | # else: 588 | # lesson_plan = self._generate_fallback_lesson_plan(topic, context, grade_level, subject, duration) 589 | 590 | # return { 591 | # "lesson_plan": lesson_plan, 592 | # "sources": [ 593 | # { 594 | # "content": doc.page_content[:200] + "...", 595 | # "metadata": doc.metadata, 596 | # "score": getattr(doc, 'score', None) 597 | # } for doc in relevant_docs 598 | # ], 599 | # "topic": topic, 600 | # "grade_level": grade_level, 601 | # "subject": subject, 602 | # "generated_at": datetime.now().isoformat(), 603 | # "source_count": len(relevant_docs) 604 | # } 605 | 606 | # def _extract_context_from_docs(self, docs: List[Any]) -> str: 607 | # """Extract and format context from retrieved documents""" 608 | # context_parts = [] 609 | 610 | # for i, doc in enumerate(docs): 611 | # content = doc.page_content.strip() 612 | # metadata = doc.metadata 613 | 614 | # # Format context with metadata 615 | # context_part = f""" 616 | # Document {i+1}: 617 | # Chapter: {metadata.get('chapter_title', 'Unknown')} 618 | # Section: {metadata.get('section_title', 'Unknown')} 619 | # Content: {content} 620 | # """ 621 | # context_parts.append(context_part) 622 | 623 | # return "\n".join(context_parts) 624 | 625 | # def _generate_with_langchain(self, topic: str, context: str, grade_level: str, 626 | # subject: str, duration: str, custom_requirements: str) -> str: 627 | # 
"""Generate lesson plan using LangChain and LLM""" 628 | 629 | # # Create prompt template 630 | # prompt_template = """ 631 | # You are an expert educational curriculum designer. Create a comprehensive, standards-aligned lesson plan using the provided context. 632 | 633 | # CONTEXT FROM CURRICULUM: 634 | # {context} 635 | 636 | # LESSON REQUIREMENTS: 637 | # - Topic: {topic} 638 | # - Grade Level: {grade_level} 639 | # - Subject: {subject} 640 | # - Duration: {duration} 641 | # - Additional Requirements: {custom_requirements} 642 | 643 | # Generate a detailed lesson plan that includes: 644 | # 1. Clear learning objectives 645 | # 2. Engaging introduction activity 646 | # 3. Main teaching activities with timing 647 | # 4. Hands-on practice activities 648 | # 5. Assessment strategies 649 | # 6. Differentiation for diverse learners 650 | # 7. Required materials 651 | # 8. Curriculum standards alignment 652 | # 9. Extension activities 653 | 654 | # Format the lesson plan professionally with clear sections and actionable details. 655 | # Ensure the plan is age-appropriate for {grade_level} students and aligns with {subject} curriculum standards. 
656 | 657 | # {format_instructions} 658 | # """ 659 | 660 | # prompt = PromptTemplate( 661 | # template=prompt_template, 662 | # input_variables=["context", "topic", "grade_level", "subject", "duration", "custom_requirements"], 663 | # partial_variables={"format_instructions": self.output_parser.get_format_instructions()} 664 | # ) 665 | 666 | # try: 667 | # # Create chain 668 | # chain = prompt | self.llm | self.output_parser 669 | 670 | # # Generate lesson plan 671 | # result = chain.invoke({ 672 | # "context": context, 673 | # "topic": topic, 674 | # "grade_level": grade_level, 675 | # "subject": subject, 676 | # "duration": duration, 677 | # "custom_requirements": custom_requirements 678 | # }) 679 | 680 | # # Format as readable lesson plan 681 | # return self._format_lesson_plan_output(result) 682 | 683 | # except Exception as e: 684 | # logger.error(f"❌ Error generating lesson plan with LangChain: {e}") 685 | # return self._generate_fallback_lesson_plan(topic, context, grade_level, subject, duration) 686 | 687 | # def _format_lesson_plan_output(self, structured_plan: LessonPlanStructure) -> str: 688 | # """Format structured lesson plan into readable text""" 689 | 690 | # lesson_plan = f""" 691 | # # {structured_plan.title} 692 | 693 | # ## 📚 Course Information 694 | # - **Subject:** {structured_plan.subject} 695 | # - **Grade Level:** {structured_plan.grade_level} 696 | # - **Duration:** {structured_plan.duration} 697 | 698 | # ## 🎯 Learning Objectives 699 | # {chr(10).join(f"{i+1}. 
{obj}" for i, obj in enumerate(structured_plan.objectives))} 700 | 701 | # ## 📋 Materials Required 702 | # {chr(10).join(f"- {material}" for material in structured_plan.materials)} 703 | 704 | # ## 🚀 Introduction Activity ({structured_plan.duration}) 705 | # {structured_plan.introduction} 706 | 707 | # ## 📖 Main Teaching Activities 708 | # {chr(10).join(f"### Activity {i+1}{chr(10)}{activity}" for i, activity in enumerate(structured_plan.main_activities))} 709 | 710 | # ## ✅ Assessment Strategy 711 | # {structured_plan.assessment} 712 | 713 | # ## 🎭 Differentiation Strategies 714 | # {structured_plan.differentiation} 715 | 716 | # ## 📏 Curriculum Standards Alignment 717 | # {chr(10).join(f"- {standard}" for standard in structured_plan.standards)} 718 | 719 | # ## 🔄 Extension Activities 720 | # - Advanced practice problems 721 | # - Research projects on related topics 722 | # - Real-world application assignments 723 | # - Peer teaching opportunities 724 | 725 | # --- 726 | # *Generated by EduPlan AI - LangChain RAG System* 727 | # *Based on curriculum embeddings from Qdrant vector database* 728 | # """ 729 | 730 | # return lesson_plan.strip() 731 | 732 | # def _generate_fallback_lesson_plan(self, topic: str, context: str, grade_level: str, 733 | # subject: str, duration: str) -> str: 734 | # """Fallback lesson plan generation without LLM""" 735 | 736 | # lesson_plan = f""" 737 | # # Lesson Plan: {topic} 738 | 739 | # ## 📚 Course Information 740 | # - **Subject:** {subject} 741 | # - **Grade Level:** {grade_level} 742 | # - **Duration:** {duration} 743 | 744 | # ## 🎯 Learning Objectives 745 | # 1. Understand the fundamental concepts of {topic} 746 | # 2. Apply learned principles to solve related problems 747 | # 3. Demonstrate comprehension through structured activities 748 | # 4. 
Connect new knowledge to existing curriculum 749 | 750 | # ## 📋 Materials Required 751 | # - Textbook and reference materials 752 | # - Whiteboard/markers or presentation tools 753 | # - Worksheets and practice exercises 754 | # - Assessment materials 755 | 756 | # ## 🚀 Introduction Activity (10 minutes) 757 | # - Engage students with a relevant real-world example 758 | # - Connect to previous knowledge 759 | # - Present learning objectives 760 | # - Set expectations for the lesson 761 | 762 | # ## 📖 Main Teaching Activities 763 | 764 | # ### Direct Instruction (15 minutes) 765 | # - Present core concepts with clear examples 766 | # - Use visual aids and demonstrations 767 | # - Encourage student questions and participation 768 | 769 | # ### Guided Practice (15 minutes) 770 | # - Work through examples together as a class 771 | # - Provide step-by-step guidance 772 | # - Address common misconceptions 773 | 774 | # ### Independent Practice (10 minutes) 775 | # - Students work individually on practice problems 776 | # - Circulate to provide individual support 777 | # - Monitor understanding and progress 778 | 779 | # ## ✅ Assessment Strategy 780 | # - Formative assessment through observation and questioning 781 | # - Exit ticket with key concept check 782 | # - Homework assignment for reinforcement 783 | 784 | # ## 🎭 Differentiation Strategies 785 | # - Provide additional support for struggling students 786 | # - Offer extension activities for advanced learners 787 | # - Use flexible grouping based on readiness 788 | 789 | # ## 📏 Curriculum Standards Alignment 790 | # - Aligned with {subject} curriculum standards 791 | # - Meets grade {grade_level} learning expectations 792 | # - Supports progression to next concepts 793 | 794 | # --- 795 | # *Generated by EduPlan AI - Fallback Mode* 796 | # *Note: For enhanced lesson plans, configure OpenAI API key* 797 | # """ 798 | 799 | # return lesson_plan.strip() 800 | 801 | # def search_similar_topics(self, topic: str, top_k: 
int = 5) -> List[Dict[str, Any]]: 802 | # """Search for similar topics in the knowledge base""" 803 | # try: 804 | # docs = self.vector_store.similarity_search(topic, k=top_k) 805 | # return [ 806 | # { 807 | # "topic": doc.metadata.get("section_title", "Unknown"), 808 | # "chapter": doc.metadata.get("chapter_title", "Unknown"), 809 | # "content_preview": doc.page_content[:150] + "...", 810 | # "similarity_score": getattr(doc, 'score', None) 811 | # } for doc in docs 812 | # ] 813 | # except Exception as e: 814 | # logger.error(f"❌ Error searching similar topics: {e}") 815 | # return [] 816 | 817 | 818 | # def main(): 819 | # """Example usage of the LangChain lesson generator""" 820 | 821 | # # Initialize generator 822 | # generator = LangChainLessonGenerator() 823 | 824 | # # Example lesson generation 825 | # topic = "Introduction to Atomic Theory" 826 | # grade_level = "9-10" 827 | # subject = "Chemistry" 828 | 829 | # print(f"🎓 Generating lesson plan for: {topic}") 830 | # print(f"📚 Grade Level: {grade_level} | Subject: {subject}") 831 | # print("-" * 60) 832 | 833 | # result = generator.generate_lesson_plan( 834 | # topic=topic, 835 | # grade_level=grade_level, 836 | # subject=subject, 837 | # duration="50 minutes" 838 | # ) 839 | 840 | # if "error" in result: 841 | # print(f"❌ Error: {result['error']}") 842 | # print("💡 Suggestions:") 843 | # for suggestion in result.get("suggestions", []): 844 | # print(f" - {suggestion}") 845 | # else: 846 | # print(result["lesson_plan"]) 847 | # print(f"\n📊 Sources used: {result['source_count']} documents") 848 | # print(f"⏰ Generated at: {result['generated_at']}") 849 | 850 | 851 | # if __name__ == "__main__": 852 | # main() 853 | --------------------------------------------------------------------------------