├── .env
├── config.py
├── requirements.txt
├── docker-compose.yml
├── .gitignore
├── Output_Lesson_Plans
    └── MCQs_energy.json
├── readme.md
├── qdrant_connector.py
├── embedding_model.py
├── process_improved_data.py
└── lesson_plan_generator.py


/.env:
--------------------------------------------------------------------------------
1 | OPENAI_API_KEY=<ENTER_Your_API_Key>
2 | QDRANT_URL=http://localhost:6333
3 | QDRANT_HOST=localhost
4 | QDRANT_PORT=6333
5 | QDRANT_COLLECTION_NAME=science_9_collection
6 | QDRANT_VECTOR_SIZE=2560
7 | 


--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
# Default configuration for the Qdrant vector database.
# These are fallbacks; the project's .env file defines the same settings.
QDRANT_COLLECTION_NAME = "science_9_collection"
# Must equal the embedding model's output dimension. The default model
# (Qwen/Qwen3-Embedding-4B) produces 2560-dimensional vectors, and a collection
# created with any other size rejects them on insert.
QDRANT_VECTOR_SIZE = 2560
QDRANT_HOST = "localhost"
QDRANT_PORT = 6333

# Default batch size for processing
BATCH_SIZE = 10
9 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | # Core dependencies
 2 | numpy>=1.20.0
 3 | pandas>=1.3.0
 4 | matplotlib>=3.4.0
 5 | torch>=2.0.0
 6 | transformers>=4.30.0
 7 | httpx>=0.24.0
 8 | 
 9 | # Embedding models
10 | transformers>=4.51.0
11 | sentence-transformers>=2.7.0
12 | 
13 | # Vector database
14 | qdrant-client>=1.7.0
15 | 
16 | # LangChain for RAG
17 | langchain>=0.1.0
18 | langchain-community>=0.0.10
19 | langchain-core>=0.1.0
20 | 
21 | # Utils
22 | tqdm>=4.65.0
23 | python-dotenv>=1.0.0
24 | pydantic>=2.0.0
25 | 


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | # EduPlan AI - Qdrant Vector Database
 2 | services:
 3 |   qdrant:
 4 |     image: qdrant/qdrant:latest
 5 |     ports:
 6 |       - "6333:6333"
 7 |       - "6334:6334"
 8 |     volumes:
 9 |       - qdrant-storage:/qdrant/storage  # Using named volume for FUSE compatibility
10 |     environment:
11 |       - QDRANT__STORAGE__WAL_CAPACITY_MB=32
12 |       - QDRANT__STORAGE__WAL_SEGMENTS_AHEAD=0
13 |     restart: unless-stopped
14 | volumes:
15 |   qdrant-storage:
16 |     driver: local    
17 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Python
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | *.so
 6 | .Python
 7 | build/
 8 | develop-eggs/
 9 | dist/
10 | downloads/
11 | eggs/
12 | .eggs/
13 | lib/
14 | lib64/
15 | parts/
16 | sdist/
17 | var/
18 | wheels/
19 | pip-wheel-metadata/
20 | share/python-wheels/
21 | *.egg-info/
22 | .installed.cfg
23 | *.egg
24 | MANIFEST
25 | 
26 | # Virtual environments
27 | env/
28 | venv/
29 | ENV/
30 | env.bak/
31 | venv.bak/
32 | eduplan_env/
33 | 
34 | # IDEs
35 | .vscode/
36 | .env
37 | .idea/
38 | *.swp
39 | *.swo
40 | 
41 | # Project specific
42 | extracted_data/
43 | rag_data/
44 | *.log
45 | .env
46 | 
47 | # Qdrant data
48 | # qdrant_storage/
49 | qdrant_storage/collections/midjourney
50 | qdrant_storage/collections/test_collection
51 | 


--------------------------------------------------------------------------------
/Output_Lesson_Plans/MCQs_energy.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "mcqs": [
 3 |     {
 4 |       "question": "Which of the following is NOT a form of energy mentioned in the provided curriculum content?",
 5 |       "options": {
 6 |         "A": "Mechanical energy",
 7 |         "B": "Nuclear energy",
 8 |         "C": "Heat energy",
 9 |         "D": "Chemical energy"
10 |       },
11 |       "correct_answer": "Nuclear energy",
12 |       "explanation": "The provided curriculum content lists mechanical, heat, chemical, electrical, and light energy, but not nuclear energy."
13 |     },
14 |     {
15 |       "question": "According to the curriculum content, what is the biggest natural source of energy for us?",
16 |       "options": {
17 |         "A": "The Earth’s core",
18 |         "B": "The Sun",
19 |         "C": "The Moon",
20 |         "D": "Nuclear reactions"
21 |       },
22 |       "correct_answer": "The Sun",
23 |       "explanation": "The curriculum content states that the Sun is the biggest natural source of energy for us."
24 |     },
25 |     {
26 |       "question": "A lamp consumes 1000 J of electrical energy in 10 seconds. What is its power?",
27 |       "options": {
28 |         "A": "10 W",
29 |         "B": "100 W",
30 |         "C": "1000 W",
31 |         "D": "10000 W"
32 |       },
33 |       "correct_answer": "100 W",
34 |       "explanation": "Power is calculated as energy divided by time. Therefore, 1000 J / 10 s = 100 W."
35 |     },
36 |     {
37 |       "question": "Which of the following energy sources is NOT directly derived from the Sun?",
38 |       "options": {
39 |         "A": "Solar energy",
40 |         "B": "Wind energy",
41 |         "C": "Geothermal energy",
42 |         "D": "Hydroelectric energy"
43 |       },
44 |       "correct_answer": "Geothermal energy",
45 |       "explanation": "Geothermal energy comes from the Earth's interior, not directly from the Sun, unlike solar, wind, and hydroelectric energy."
46 |     },
47 |     {
48 |       "question": "What kind of energy conversion is involved in the formation of coal and petroleum?",
49 |       "options": {
50 |         "A": "Mechanical to chemical",
51 |         "B": "Chemical to electrical",
52 |         "C": "Solar to chemical",
53 |         "D": "Nuclear to thermal"
54 |       },
55 |       "correct_answer": "Solar to chemical",
56 |       "explanation": "Coal and petroleum are formed from ancient organic matter that originally captured solar energy through photosynthesis, converting it into chemical energy."
57 |     }
58 |   ],
59 |   "topic": "Energy",
60 |   "chapter": null,
61 |   "section": null,
62 |   "generated_at": "2025-09-01T18:37:49.396834",
63 |   "source_count": 8,
64 |   "sources_preview": [
65 |     {
66 |       "chapter": "Unknown",
67 |       "section": "Unknown",
68 |       "content_type": "Unknown",
69 |       "preview": "Luckily the world we live in provides energy in many different forms. The various forms include mech..."
70 |     },
71 |     {
72 |       "chapter": "Unknown",
73 |       "section": "Unknown",
74 |       "content_type": "Unknown",
75 |       "preview": "Life is impossible without energy. The demand for energy is ever increasing. Where do we get energy ..."
76 |     },
77 |     {
78 |       "chapter": "Unknown",
79 |       "section": "Unknown",
80 |       "content_type": "Unknown",
81 |       "preview": "Questions Section: Questions\nQuestions:\n1. 1. What is power?\n2. 2. Define 1 watt of power.\n3. 3. A l..."
82 |     }
83 |   ]
84 | }


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
 1 | # EduPlan AI: Automated MCQ Generator with RAG & Qdrant
 2 | 
 3 | ## Overview
 4 | EduPlan AI is an automated lesson and MCQ generator for educational content. It uses Retrieval-Augmented Generation (RAG) with a Qdrant vector database and OpenAI's GPT models to generate high-quality multiple-choice questions (MCQs) from your curriculum PDFs and processed data.
 5 | 
 6 | ## Features
 7 | - **Semantic Search**: Uses Qdrant vector database for fast, relevant retrieval of curriculum content.
 8 | - **MCQ Generation**: Generates exactly 5 MCQs for any topic using OpenAI's GPT-4o.
 9 | - **Flexible Filtering**: Filter MCQs by chapter, section, or content type.
10 | - **Embeddings**: Supports Qwen3-Embedding-4B (Qwen/Qwen3-Embedding-4B on Hugging Face) and custom embedding pipelines.
11 | - **Easy Output**: MCQs are saved in `Output_Lesson_Plans` as JSON files for easy use.
12 | 
13 | ## Folder Structure
14 | ```
15 | Qwen3_PDF/
16 | ├── config.py                # Project configuration (Qdrant host, port, etc.)
17 | ├── docker-compose.yml       # Qdrant database container setup
18 | ├── embedding_model.py       # Embedding pipeline (Qwen3-Embedding)
19 | ├── lesson_plan_generator.py # Main MCQ generator script
20 | ├── process_improved_data.py # Data processing and embedding script
21 | ├── qdrant_connector.py      # Qdrant database connector
22 | ├── requirements.txt         # Python dependencies
23 | ├── Output_Lesson_Plans/     # Generated MCQ JSON files
24 | ├── rag_data/                # Raw and processed curriculum PDFs
25 | │   ├── raw/                 # Original PDFs
26 | │   └── processed/           # Processed data (if any)
27 | ```
28 | 
29 | ## Setup
30 | 1. **Clone the repository**
31 | 2. **Install dependencies**:
32 |    ```sh
33 |    pip install -r requirements.txt
34 |    ```
35 | 3. **Start Qdrant database**:
36 |    ```sh
37 |    docker-compose up -d
38 |    ```
39 | 4. **Configure environment variables**:
40 |    - Create a `.env` file in the project root with your OpenAI API key:
41 |      ```env
42 |      OPENAI_API_KEY=sk-...
43 |      QDRANT_HOST=localhost
44 |      QDRANT_PORT=6333
45 |      QDRANT_COLLECTION_NAME=science_9_collection
46 |      QDRANT_VECTOR_SIZE=2560
47 |      ```
48 | 
49 | ## Usage
50 | ### 1. Process Curriculum Data
51 | If you have new or improved curriculum data, run:
52 | ```sh
53 | python process_improved_data.py
54 | ```
55 | This will generate embeddings and store them in Qdrant.
56 | 
57 | ### 2. Generate MCQs
58 | Run the MCQ generator for any topic:
59 | ```sh
60 | python lesson_plan_generator.py --topic "Energy" --chapter "10" --section "10.1"
61 | ```
62 | - The generated MCQs will be saved in `Output_Lesson_Plans/MCQs_energy.json`.
63 | - You can omit `--chapter` and `--section` for broader search.
64 | 
65 | ## Output Format
66 | Each MCQ JSON file contains:
67 | - 5 MCQs with question, options (A-D), correct answer, and explanation
68 | - Metadata: topic, chapter, section, source preview
69 | 
70 | ## Customization
71 | - **Embeddings**: You can use your own embedding model by editing `embedding_model.py`.
72 | - **Qdrant Collection**: Change collection name/vector size in `config.py` and `.env`; the vector size must match the embedding model's output dimension (2560 for Qwen3-Embedding-4B).
73 | - **MCQ Format**: Edit `lesson_plan_generator.py` for custom output formatting.
74 | 
75 | ## Troubleshooting
76 | - Ensure Qdrant is running (`docker-compose up -d`).
77 | - Make sure your `.env` file contains a valid OpenAI API key.
78 | - Check `requirements.txt` for missing dependencies.
79 | - For import errors, run scripts from the project root.
80 | 
81 | ## License
82 | MIT License
83 | 
84 | ## Credits
85 | - [Qdrant Vector Database](https://qdrant.tech/)
86 | - [LangChain](https://langchain.com/)
87 | - [OpenAI GPT](https://platform.openai.com/)
88 | - [Qwen3 Embedding](https://huggingface.co/Qwen/Qwen3-Embedding-4B)
89 | 


--------------------------------------------------------------------------------
/qdrant_connector.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Qdrant vector database connector for EduPlan AI.
  3 | This module provides a connector for interacting with the Qdrant vector database.
  4 | """
  5 | 
  6 | import logging
  7 | import httpx
  8 | from typing import List, Dict, Any, Optional, Union
  9 | from qdrant_client import QdrantClient
 10 | from qdrant_client.http import models
 11 | from qdrant_client.http.exceptions import UnexpectedResponse
 12 | 
 13 | # Configure logging
 14 | logging.basicConfig(
 15 |     level=logging.INFO,
 16 |     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 17 | )
 18 | logger = logging.getLogger(__name__)
 19 | 
 20 | class QdrantConnector:
 21 |     """Connector for interacting with the Qdrant vector database."""
 22 |     
 23 |     def __init__(self, host: str = "localhost", port: int = 6333, 
 24 |                  collection_name: str = "eduplan", vector_size: int = 4096):
 25 |         """
 26 |         Initialize the Qdrant connector.
 27 |         
 28 |         Args:
 29 |             host: Qdrant server hostname
 30 |             port: Qdrant server port
 31 |             collection_name: Name of the collection to use
 32 |             vector_size: Dimensionality of the vectors to store
 33 |         """
 34 |         self.host = host
 35 |         self.port = port
 36 |         self.collection_name = collection_name
 37 |         self.vector_size = vector_size
 38 |         
 39 |         # Initialize client
 40 |         try:
 41 |             self.client = QdrantClient(host=host, port=port)
 42 |             logger.debug(f"Connected to Qdrant at {host}:{port}")
 43 |         except Exception as e:
 44 |             logger.error(f"Error connecting to Qdrant: {e}")
 45 |             raise
 46 |             
 47 |     def recreate_collection(self) -> bool:
 48 |         """
 49 |         Delete collection if it exists and create a new one.
 50 |         
 51 |         Returns:
 52 |             True if successful, False otherwise
 53 |         """
 54 |         try:
 55 |             # Check if collection exists
 56 |             collections = self.client.get_collections().collections
 57 |             collection_names = [collection.name for collection in collections]
 58 |             
 59 |             if self.collection_name in collection_names:
 60 |                 # Delete existing collection
 61 |                 self.client.delete_collection(collection_name=self.collection_name)
 62 |                 print(f"🗑️ Deleted existing collection: {self.collection_name}")
 63 |             
 64 |             # Create new collection
 65 |             self.client.create_collection(
 66 |                 collection_name=self.collection_name,
 67 |                 vectors_config=models.VectorParams(
 68 |                     size=self.vector_size,
 69 |                     distance=models.Distance.COSINE
 70 |                 )
 71 |             )
 72 |             print(f"✅ Created new collection: {self.collection_name}")
 73 |             return True
 74 |             
 75 |         except Exception as e:
 76 |             logger.error(f"Error recreating collection: {e}")
 77 |             return False
 78 |             
 79 |     def get_collection_info(self) -> Dict[str, Any]:
 80 |         """
 81 |         Get information about the collection.
 82 |         
 83 |         Returns:
 84 |             Dictionary containing collection information
 85 |         """
 86 |         try:
 87 |             return self.client.get_collection(collection_name=self.collection_name)
 88 |         except Exception as e:
 89 |             logger.error(f"Error getting collection info: {e}")
 90 |             return {}
 91 |             
 92 |     def insert_documents(self, documents: List[Dict], embeddings: List[List[float]], batch_size: int = 2) -> bool:
 93 |         """
 94 |         Insert documents with embeddings into Qdrant.
 95 |         
 96 |         Args:
 97 |             documents: List of document dictionaries with 'id', 'text', and 'metadata'
 98 |             embeddings: List of embedding vectors (must match documents length)
 99 |             batch_size: Number of documents to insert at once
100 |             
101 |         Returns:
102 |             True if insertion was successful
103 |         """
104 |         try:
105 |             if len(documents) != len(embeddings):
106 |                 logger.error(f"Document count ({len(documents)}) does not match embeddings count ({len(embeddings)})")
107 |                 return False
108 |                 
109 |             logger.info(f"Inserting {len(documents)} documents into collection '{self.collection_name}'")
110 |             
111 |             # Process in batches
112 |             for i in range(0, len(documents), batch_size):
113 |                 batch_docs = documents[i:i+batch_size]
114 |                 batch_embeddings = embeddings[i:i+batch_size]
115 |                 
116 |                 # Prepare points for insertion
117 |                 points = []
118 |                 for doc, emb in zip(batch_docs, batch_embeddings):
119 |                     # Ensure ID is a string or integer (not a list)
120 |                     doc_id = doc.get("id")
121 |                     if isinstance(doc_id, list):
122 |                         # If ID is a list, convert to string
123 |                         doc_id = str(doc_id)
124 |                     
125 |                     point = {
126 |                         "id": doc_id,
127 |                         "vector": emb,
128 |                         "payload": {
129 |                             "text": doc.get("text", ""),
130 |                             "metadata": doc.get("metadata", {})
131 |                         }
132 |                     }
133 |                     points.append(point)
134 |                 
135 |                 # Insert batch
136 |                 self.client.upsert(
137 |                     collection_name=self.collection_name,
138 |                     points=points
139 |                 )
140 |                 
141 |             logger.info(f"Successfully inserted {len(documents)} documents")
142 |             return True
143 |             
144 |         except Exception as e:
145 |             logger.error(f"Error inserting documents: {str(e)}")
146 |             return False
147 |             
148 |     def search_documents(self, query_vector: List[float], limit: int = 5, 
149 |                         filter: Optional[Dict[str, Any]] = None) -> List[Dict[str, Any]]:
150 |         """
151 |         Search for similar documents in the collection.
152 |         
153 |         Args:
154 |             query_vector: Query embedding vector
155 |             limit: Maximum number of results to return
156 |             filter: Optional filter to apply to the search
157 |             
158 |         Returns:
159 |             List of matching documents
160 |         """
161 |         try:
162 |             return self.client.search(
163 |                 collection_name=self.collection_name,
164 |                 query_vector=query_vector,
165 |                 limit=limit,
166 |                 query_filter=filter
167 |             )
168 |         except Exception as e:
169 |             logger.error(f"Error searching documents: {e}")
170 |             return []
171 |             
172 |     def delete_document(self, document_id: Union[str, int]) -> bool:
173 |         """
174 |         Delete a document from the collection.
175 |         
176 |         Args:
177 |             document_id: ID of the document to delete
178 |             
179 |         Returns:
180 |             True if successful, False otherwise
181 |         """
182 |         try:
183 |             self.client.delete(
184 |                 collection_name=self.collection_name,
185 |                 points_selector=models.PointIdsList(
186 |                     points=[document_id]
187 |                 )
188 |             )
189 |             return True
190 |         except Exception as e:
191 |             logger.error(f"Error deleting document: {e}")
192 |             return False
193 |             
194 |     def get_document(self, document_id: Union[str, int]) -> Optional[Dict[str, Any]]:
195 |         """
196 |         Get a document from the collection by ID.
197 |         
198 |         Args:
199 |             document_id: ID of the document to get
200 |             
201 |         Returns:
202 |             Document if found, None otherwise
203 |         """
204 |         try:
205 |             results = self.client.retrieve(
206 |                 collection_name=self.collection_name,
207 |                 ids=[document_id]
208 |             )
209 |             return results[0] if results else None
210 |         except Exception as e:
211 |             logger.error(f"Error getting document: {e}")
212 |             return None
213 | 
# Example usage (requires a Qdrant server listening on localhost:6333)
if __name__ == "__main__":
    # Test the connector against a throwaway 4-dimensional collection
    connector = QdrantConnector(
        host="localhost",
        port=6333,
        collection_name="test_collection",
        vector_size=4
    )

    # Create a test collection
    connector.recreate_collection()

    # insert_documents() reads 'id', 'text' and 'metadata' from each document
    # and takes the vectors as a separate, parallel list.
    test_docs = [
        {"id": 1, "text": "Test document 1", "metadata": {"source": "test"}},
        {"id": 2, "text": "Test document 2", "metadata": {"source": "test"}},
    ]
    test_embeddings = [
        [0.1, 0.2, 0.3, 0.4],
        [0.2, 0.3, 0.4, 0.5],
    ]

    connector.insert_documents(test_docs, test_embeddings)

    # Search for similar documents
    results = connector.search_documents([0.1, 0.2, 0.3, 0.4], limit=1)
    print(f"Search results: {results}")
246 | 
247 | 


--------------------------------------------------------------------------------
/embedding_model.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Embedding model for EduPlan AI.
  3 | This module provides the NVEmbedPipeline class for generating embeddings
  4 | with a Hugging Face embedding model (default: Qwen/Qwen3-Embedding-4B).
  5 | """
  6 | 
  7 | import logging
  8 | import time
  9 | import torch
 10 | from typing import List, Union, Dict, Any
 11 | from transformers import AutoModel, AutoTokenizer
 12 | import numpy as np
 13 | 
 14 | # Configure logging
 15 | logging.basicConfig(
 16 |     level=logging.INFO,
 17 |     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 18 | )
 19 | logger = logging.getLogger(__name__)
 20 | 
 21 | class NVEmbedPipeline:
 22 |     """Pipeline for generating embeddings using NVIDIA NV-Embed."""
 23 |     
 24 |     def __init__(self, model_name: str = "Qwen/Qwen3-Embedding-4B", device: str = None):
 25 |         """Initialize the NVEmbedPipeline."""
 26 |         self.model_name = model_name
 27 |         
 28 |         # Use CUDA if available, otherwise fall back to CPU
 29 |         if device is None:
 30 |             self.device = "cuda:0" if torch.cuda.is_available() else "cpu"  # Use main discrete GPU
 31 |         else:
 32 |             self.device = device
 33 |             
 34 |         # Load model and tokenizer
 35 |         self._load_model()
 36 |         
 37 |     def _load_model(self):
 38 |         """Load the NV-Embed model and tokenizer."""
 39 |         try:
 40 |             # GPU memory management (ADD THIS)
 41 |             if self.device.startswith("cuda"):
 42 |                 torch.cuda.set_per_process_memory_fraction(1.0)  # Use only 70% of VRAM
 43 |                 torch.cuda.empty_cache()
 44 | 
 45 |             # Print loading message
 46 |             print(f"🔄 Loading Qwen/Qwen3-Embedding-4B: {self.model_name}")
 47 |             print(f"🎯 Using device: {self.device}")
 48 |             
 49 |             # Use half precision for GPU to save memory
 50 |             dtype = torch.float16 if self.device.startswith("cuda") else torch.float32
 51 | 
 52 |             # dtype = torch.float16 
 53 |             # Load tokenizer with trust_remote_code=True for NVIDIA models
 54 |             self.tokenizer = AutoTokenizer.from_pretrained(
 55 |                 self.model_name, 
 56 |                 trust_remote_code=True
 57 |             )
 58 |             
 59 |             # Load model with memory optimization (ADD low_cpu_mem_usage=True)
 60 |             self.model = AutoModel.from_pretrained(
 61 |                 self.model_name, 
 62 |                 trust_remote_code=True,
 63 |                 torch_dtype=dtype,
 64 |                 low_cpu_mem_usage=True  # ← ADD THIS
 65 |             )
 66 |             
 67 |             # Move model to device
 68 |             self.model.to(self.device)
 69 |             
 70 |             # Save embedding dimension from config
 71 |             self.embedding_dim = 2560  # Hard-coded for NV-Embed-v2
 72 |             self.vector_size = self.embedding_dim  # Add this for compatibility
 73 |             
 74 |             # Print success message
 75 |             print(f"✅ Qwen/Qwen3-Embedding-4B loaded successfully!")
 76 |             print(f"   📊 Vector size: {self.embedding_dim}")
 77 |             print(f"   🎯 Device: {self.device}")
 78 |             print(f"   📏 Model dtype: {dtype}")
 79 |             
 80 |         except Exception as e:
 81 |             logger.error(f"Error loading NV-Embed model: {e}")
 82 |             raise
 83 |     
 84 |     def embed_texts(self, texts: List[str], batch_size: int = 1) -> List[List[float]]:
 85 |         """Generate embeddings for a list of texts."""
 86 |         print(f"🔄 Generating embeddings for {len(texts)} texts...")
 87 |         
 88 |         embeddings = []
 89 |         
 90 |         # Process in batches
 91 |         for i in range(0, len(texts), batch_size):
 92 |             batch_texts = texts[i:i+batch_size]
 93 |             
 94 |             # Tokenize batch
 95 |             inputs = self.tokenizer(
 96 |                 batch_texts, 
 97 |                 padding=True, 
 98 |                 truncation=True, 
 99 |                 max_length=512,
100 |                 return_tensors="pt"
101 |             ).to(self.device)
102 |             
103 |             # Cast float tensors to model dtype
104 |             for key in inputs:
105 |                 if torch.is_floating_point(inputs[key]):
106 |                     inputs[key] = inputs[key].to(self.model.dtype)
107 |             
108 |             try:
109 |                 # Generate embeddings with no gradient tracking
110 |                 with torch.no_grad():
111 |                     outputs = self.model(**inputs)
112 |                 
113 |                 # Get embeddings - simpler approach with less memory usage
114 |                 if isinstance(outputs, dict):
115 |                     if "sentence_embeddings" in outputs:
116 |                         token_embeddings = outputs["sentence_embeddings"]
117 |                         
118 |                         # Check shape - don't print to reduce log spam
119 |                         if len(token_embeddings.shape) == 3:
120 |                             # Memory-efficient pooling: process one sequence at a time
121 |                             batch_embeddings = []
122 |                             for seq_idx in range(token_embeddings.shape[0]):
123 |                                 # Get single sequence tokens and its mask
124 |                                 seq_tokens = token_embeddings[seq_idx]
125 |                                 seq_mask = inputs["attention_mask"][seq_idx]
126 |                                 
127 |                                 # Apply mask and mean only for this sequence
128 |                                 masked_tokens = seq_tokens * seq_mask.unsqueeze(-1)
129 |                                 # Sum and divide by non-zero mask elements
130 |                                 sum_tokens = torch.sum(masked_tokens, dim=0)
131 |                                 token_count = torch.sum(seq_mask).item()
132 |                                 if token_count > 0:
133 |                                     mean_embedding = (sum_tokens / token_count).cpu().numpy()
134 |                                 else:
135 |                                     # Fallback if no tokens (shouldn't happen)
136 |                                     mean_embedding = torch.zeros(self.embedding_dim).cpu().numpy()
137 |                                     
138 |                                 batch_embeddings.append(mean_embedding)
139 |                             
140 |                             # Convert to numpy array
141 |                             batch_embeddings = np.array(batch_embeddings)
142 |                         else:
143 |                             # Already sentence-level embeddings
144 |                             batch_embeddings = token_embeddings.cpu().numpy()
145 |                     else:
146 |                         # Alternative keys
147 |                         if "last_hidden_state" in outputs:
148 |                             # Similar memory-efficient approach for last_hidden_state
149 |                             token_embeddings = outputs["last_hidden_state"]
150 |                             batch_embeddings = []
151 |                             for seq_idx in range(token_embeddings.shape[0]):
152 |                                 seq_tokens = token_embeddings[seq_idx]
153 |                                 seq_mask = inputs["attention_mask"][seq_idx]
154 |                                 masked_tokens = seq_tokens * seq_mask.unsqueeze(-1)
155 |                                 sum_tokens = torch.sum(masked_tokens, dim=0)
156 |                                 token_count = torch.sum(seq_mask).item()
157 |                                 if token_count > 0:
158 |                                     mean_embedding = (sum_tokens / token_count).cpu().numpy()
159 |                                 else:
160 |                                     mean_embedding = torch.zeros(self.embedding_dim).cpu().numpy()
161 |                                 batch_embeddings.append(mean_embedding)
162 |                             
163 |                             batch_embeddings = np.array(batch_embeddings)
164 |                         else:
165 |                             # Try to find any usable tensor
166 |                             usable_key = None
167 |                             for key, value in outputs.items():
168 |                                 if isinstance(value, torch.Tensor) and value.dim() >= 2:
169 |                                     usable_key = key
170 |                                     break
171 |                             
172 |                             if usable_key:
173 |                                 logger.info(f"Using fallback key: {usable_key}")
174 |                                 batch_embeddings = outputs[usable_key].cpu().numpy()
175 |                             else:
176 |                                 raise ValueError("Cannot find usable embeddings in model output")
177 |                 else:
178 |                     # Direct tensor - use memory-efficient approach
179 |                     token_embeddings = outputs
180 |                     batch_embeddings = []
181 |                     for seq_idx in range(token_embeddings.shape[0]):
182 |                         seq_tokens = token_embeddings[seq_idx]
183 |                         seq_mask = inputs["attention_mask"][seq_idx]
184 |                         masked_tokens = seq_tokens * seq_mask.unsqueeze(-1)
185 |                         sum_tokens = torch.sum(masked_tokens, dim=0)
186 |                         token_count = torch.sum(seq_mask).item()
187 |                         if token_count > 0:
188 |                             mean_embedding = (sum_tokens / token_count).cpu().numpy()
189 |                         else:
190 |                             mean_embedding = torch.zeros(self.embedding_dim).cpu().numpy()
191 |                         batch_embeddings.append(mean_embedding)
192 |                     
193 |                     batch_embeddings = np.array(batch_embeddings)
194 |                 
195 |                 # Add to results
196 |                 embeddings.extend(batch_embeddings)
197 |                 
198 |             except RuntimeError as e:
199 |                 if 'out of memory' in str(e):
200 |                     # Out of memory, clean up and try with smaller batch
201 |                     torch.cuda.empty_cache()
202 |                     logger.warning(f"GPU out of memory, reducing batch size and retrying...")
203 |                     
204 |                     if batch_size > 1:
205 |                         # Try with batch_size of 1
206 |                         for text in batch_texts:
207 |                             try:
208 |                                 # Process one text at a time
209 |                                 single_embedding = self.embed_query(text)
210 |                                 embeddings.append(single_embedding)
211 |                             except Exception as inner_e:
212 |                                 logger.error(f"Error processing single text: {inner_e}")
213 |                                 # Add zeros as fallback
214 |                                 embeddings.append([0.0] * self.embedding_dim)
215 |                     else:
216 |                         # Even batch_size=1 failed, add zeros as fallback
217 |                         logger.error(f"Cannot process even with batch_size=1: {e}")
218 |                         for _ in batch_texts:
219 |                             embeddings.append([0.0] * self.embedding_dim)
220 |                 else:
221 |                     # Other error
222 |                     raise
223 |                 
224 |             # Log progress
225 |             print(f"   📊 Processed {min(i+batch_size, len(texts))}/{len(texts)} texts ({self.device.upper()})")
226 |             
227 |             # Free GPU memory
228 |             if self.device.startswith("cuda"):
229 |                 torch.cuda.empty_cache()
230 |                 
231 |         # Convert to lists for consistent output
232 |         result = []
233 |         for emb in embeddings:
234 |             # Convert to list and ensure correct dimension
235 |             if isinstance(emb, np.ndarray):
236 |                 emb_list = emb.tolist()
237 |             else:
238 |                 emb_list = list(emb)
239 |             
240 |             # Check dimensions
241 |             if len(emb_list) != self.vector_size:
242 |                 logger.warning(f"Fixing dimension: {len(emb_list)} → {self.vector_size}")
243 |                 if len(emb_list) < self.vector_size:
244 |                     # Pad with zeros
245 |                     emb_list = emb_list + [0.0] * (self.vector_size - len(emb_list))
246 |                 else:
247 |                     # Truncate
248 |                     emb_list = emb_list[:self.vector_size]
249 |             
250 |             result.append(emb_list)
251 |         
252 |         print(f"✅ Generated {len(result)} embeddings with dimension {self.vector_size}")
253 |         return result
254 |             
255 |     def embed_query(self, text: str) -> List[float]:
256 |         """Generate embedding for a single query text."""
257 |         try:
258 |             print(torch.cuda.device_count())
259 |             print(torch.cuda.get_device_name(0))
260 |             print(torch.cuda.get_device_name(1))
261 |         except Exception as e:
262 |             print(f"Error occurred while accessing GPU: {e}")
263 |         result = self.embed_texts([text], batch_size=1)
264 |         return result[0] if result else []
265 | 


--------------------------------------------------------------------------------
/process_improved_data.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """
  3 | Process improved data for EduPlan AI system.
  4 | This script loads the improved JSON data, generates embeddings using NV-Embed,
  5 | and stores them in the Qdrant vector database for efficient retrieval.
  6 | """
  7 | 
  8 | import sys
  9 | import os
 10 | import json
 11 | import time
 12 | from typing import List, Dict, Any, Tuple
 13 | from pathlib import Path
 14 | import logging
 15 | 
 16 | # Add parent directory to path to import modules
 17 | sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
 18 | 
 19 | # Import required modules
 20 | from src.models.embedding_model import NVEmbedPipeline
 21 | from src.database.qdrant_connector import QdrantConnector
 22 | from src.core.config import QDRANT_COLLECTION_NAME, QDRANT_HOST, QDRANT_PORT, QDRANT_VECTOR_SIZE
 23 | 
 24 | # Configure logging
 25 | logging.basicConfig(
 26 |     level=logging.INFO,
 27 |     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 28 | )
 29 | logger = logging.getLogger(__name__)
 30 | 
 31 | def load_improved_data(data_dir: str = "../../data/processed_improved") -> List[Dict[str, Any]]:
 32 |     """
 33 |     Load all improved data files from the specified directory.
 34 |     
 35 |     Args:
 36 |         data_dir: Directory containing improved JSON data files
 37 |     
 38 |     Returns:
 39 |         List of dictionaries containing the loaded data
 40 |     """
 41 |     # Resolve data path relative to this script file
 42 |     script_dir = Path(__file__).resolve().parent
 43 |     data_path = script_dir / data_dir
 44 |     logger.info(f"Loading improved data from {data_path}")
 45 |     
 46 |     all_data = []
 47 |     
 48 |     if not data_path.exists():
 49 |         logger.error(f"Data directory not found: {data_path}")
 50 |         return all_data
 51 |     
 52 |     # Find all JSON files with the new naming pattern (Chapter_X_Y.json)
 53 |     json_files = sorted(data_path.glob("Chapter_*.json"))
 54 |     
 55 |     if not json_files:
 56 |         logger.warning(f"No Chapter_*.json files found in {data_path}")
 57 |         return all_data
 58 |     
 59 |     logger.info(f"Found {len(json_files)} JSON files to load")
 60 |     
 61 |     for json_file in json_files:
 62 |         try:
 63 |             with open(json_file, 'r', encoding='utf-8') as f:
 64 |                 data = json.load(f)
 65 |                 logger.info(f"Loaded {json_file.name}: {len(data) if isinstance(data, list) else '1'} items")
 66 |                 all_data.append({
 67 |                     "file": json_file.name,
 68 |                     "data": data
 69 |                 })
 70 |         except Exception as e:
 71 |             logger.error(f"Error loading {json_file}: {e}")
 72 |     
 73 |     return all_data
 74 | 
 75 | def extract_chapter_info(filename: str) -> Tuple[str, str]:
 76 |     """
 77 |     Extract chapter number and chunk info from filename.
 78 |     
 79 |     Args:
 80 |         filename: Filename like "Chapter_1_1.json"
 81 |     
 82 |     Returns:
 83 |         Tuple of (chapter_number, chunk_number)
 84 |     """
 85 |     try:
 86 |         # Parse filename: Chapter_X_Y.json -> chapter=X, chunk=Y
 87 |         name_parts = filename.replace('.json', '').split('_')
 88 |         if len(name_parts) >= 3:
 89 |             chapter_num = name_parts[1]
 90 |             chunk_num = name_parts[2]
 91 |             return chapter_num, chunk_num
 92 |         else:
 93 |             # Fallback for unexpected format
 94 |             return name_parts[1] if len(name_parts) > 1 else "unknown", "1"
 95 |     except Exception as e:
 96 |         logger.warning(f"Could not parse filename {filename}: {e}")
 97 |         return "unknown", "1"
 98 | 
 99 | def extract_content_from_block(content_block: Dict[str, Any]) -> str:
100 |     """
101 |     Extract text content from a content block based on its type.
102 |     
103 |     Args:
104 |         content_block: Dictionary containing content block data
105 |     
106 |     Returns:
107 |         Extracted text content
108 |     """
109 |     block_type = content_block.get("type", "unknown")
110 |     content_parts = []
111 |     
112 |     if block_type == "text":
113 |         content = content_block.get("content", "")
114 |         if content:
115 |             content_parts.append(content)
116 |     
117 |     elif block_type == "summary":
118 |         title = content_block.get("title", "")
119 |         if title:
120 |             content_parts.append(f"Summary: {title}")
121 |         
122 |         summary_points = content_block.get("summary_points", [])
123 |         if summary_points:
124 |             content_parts.append("Key Points:")
125 |             for point in summary_points:
126 |                 content_parts.append(f"• {point}")
127 |     
128 |     elif block_type == "activity":
129 |         activity_num = content_block.get("activity_number", "")
130 |         title = content_block.get("title", "")
131 |         description = content_block.get("description", "")
132 |         questions = content_block.get("questions", [])
133 |         
134 |         # Build activity content
135 |         activity_parts = []
136 |         if activity_num:
137 |             activity_parts.append(f"Activity {activity_num}")
138 |         if title:
139 |             activity_parts.append(f"Title: {title}")
140 |         if description:
141 |             activity_parts.append(f"Description: {description}")
142 |         if questions:
143 |             activity_parts.append("Questions:")
144 |             for i, question in enumerate(questions, 1):
145 |                 activity_parts.append(f"{i}. {question}")
146 |         
147 |         if activity_parts:
148 |             content_parts.append("Activity:\n" + "\n".join(activity_parts))
149 |     
150 |     elif block_type == "questions":
151 |         title = content_block.get("title", "")
152 |         questions = content_block.get("questions", [])
153 |         
154 |         if title:
155 |             content_parts.append(f"Questions Section: {title}")
156 |         
157 |         if questions:
158 |             content_parts.append("Questions:")
159 |             for i, question in enumerate(questions, 1):
160 |                 content_parts.append(f"{i}. {question}")
161 |     
162 |     return "\n".join(content_parts)
163 | 
164 | def prepare_documents(improved_data: List[Dict[str, Any]]) -> Tuple[List[str], List[Dict[str, Any]]]:
165 |     """
166 |     Extract text and metadata from improved data with the new chunked format.
167 |     
168 |     Args:
169 |         improved_data: List of dictionaries containing improved data
170 |     
171 |     Returns:
172 |         Tuple containing (text_chunks, metadata)
173 |     """
174 |     texts = []
175 |     metadata = []
176 |     
177 |     for file_data in improved_data:
178 |         filename = file_data["file"]
179 |         data = file_data["data"]
180 |         
181 |         # Extract chapter and chunk info from new filename format
182 |         chapter_num, chunk_num = extract_chapter_info(filename)
183 |         
184 |         logger.info(f"Processing {filename} - Chapter {chapter_num}, Chunk {chunk_num}")
185 |         
186 |         # Handle the new format: data is a list with chapter objects
187 |         if isinstance(data, list) and len(data) > 0:
188 |             for chapter_idx, chapter_data in enumerate(data):
189 |                 if not isinstance(chapter_data, dict):
190 |                     continue
191 |                 
192 |                 chapter_number = chapter_data.get("chapter_number", chapter_num)
193 |                 chapter_title = chapter_data.get("chapter_title", f"Chapter {chapter_num}")
194 |                 
195 |                 # Process all sections in this chapter
196 |                 sections = chapter_data.get("sections", [])
197 |                 
198 |                 for section_idx, section in enumerate(sections):
199 |                     section_number = section.get("section_number", "")
200 |                     section_title = section.get("section_title", "")
201 |                     
202 |                     # Process content blocks in main section
203 |                     content_blocks = section.get("content_blocks", [])
204 |                     
205 |                     for block_idx, content_block in enumerate(content_blocks):
206 |                         if not isinstance(content_block, dict):
207 |                             continue
208 |                         
209 |                         # Extract text content from this block
210 |                         text_content = extract_content_from_block(content_block)
211 |                         
212 |                         # Only add if we have substantial content
213 |                         if text_content and len(text_content.strip()) > 10:
214 |                             texts.append(text_content.strip())
215 |                             
216 |                             # Create metadata for this content block
217 |                             meta = {
218 |                                 "id": f"ch{chapter_num}_chunk{chunk_num}_s{section_idx}_b{block_idx}",
219 |                                 "chapter_number": chapter_number,
220 |                                 "chapter_title": chapter_title,
221 |                                 "section_number": section_number or f"Section {section_idx + 1}",
222 |                                 "section_title": section_title,
223 |                                 "content_type": content_block.get("type", "unknown"),
224 |                                 "source_file": filename,
225 |                                 "chunk_number": chunk_num,
226 |                                 "original_chapter": f"Chapter {chapter_num}",
227 |                                 "block_index": block_idx
228 |                             }
229 |                             metadata.append(meta)
230 |                     
231 |                     # Process sub-sections
232 |                     sub_sections = section.get("sub_sections", [])
233 |                     
234 |                     for sub_idx, sub_section in enumerate(sub_sections):
235 |                         sub_section_number = sub_section.get("section_number", "")
236 |                         sub_section_title = sub_section.get("section_title", "")
237 |                         
238 |                         # Process content blocks in sub-sections
239 |                         sub_content_blocks = sub_section.get("content_blocks", [])
240 |                         
241 |                         for sub_block_idx, content_block in enumerate(sub_content_blocks):
242 |                             if not isinstance(content_block, dict):
243 |                                 continue
244 |                             
245 |                             # Extract text content from this block
246 |                             text_content = extract_content_from_block(content_block)
247 |                             
248 |                             # Only add if we have substantial content
249 |                             if text_content and len(text_content.strip()) > 10:
250 |                                 texts.append(text_content.strip())
251 |                                 
252 |                                 # Create metadata for this sub-section content block
253 |                                 meta = {
254 |                                     "id": f"ch{chapter_num}_chunk{chunk_num}_s{section_idx}_sub{sub_idx}_b{sub_block_idx}",
255 |                                     "chapter_number": chapter_number,
256 |                                     "chapter_title": chapter_title,
257 |                                     "section_number": section_number or f"Section {section_idx + 1}",
258 |                                     "section_title": section_title,
259 |                                     "sub_section_number": sub_section_number,
260 |                                     "sub_section_title": sub_section_title,
261 |                                     "content_type": content_block.get("type", "unknown"),
262 |                                     "source_file": filename,
263 |                                     "chunk_number": chunk_num,
264 |                                     "original_chapter": f"Chapter {chapter_num}",
265 |                                     "block_index": sub_block_idx,
266 |                                     "is_sub_section": True
267 |                                 }
268 |                                 metadata.append(meta)
269 |         
270 |         else:
271 |             logger.warning(f"Unexpected data format for file: {filename}")
272 |     
273 |     logger.info(f"Prepared {len(texts)} documents with metadata")
274 |     return texts, metadata
275 | 
276 | def generate_embeddings(texts: List[str]) -> List[List[float]]:
277 |     """
278 |     Generate embeddings for text chunks using NV-Embed.
279 |     
280 |     Args:
281 |         texts: List of text chunks to embed
282 |     
283 |     Returns:
284 |         List of embedding vectors
285 |     """
286 |     logger.info("Initializing NV-Embed model...")
287 |     embedding_model = NVEmbedPipeline()
288 |     
289 |     logger.info(f"Generating embeddings for {len(texts)} documents...")
290 |     start_time = time.time()
291 |     embeddings = embedding_model.embed_texts(texts)
292 |     elapsed = time.time() - start_time
293 |     
294 |     logger.info(f"Generated {len(embeddings)} embeddings in {elapsed:.2f} seconds")
295 |     
296 |     # Debug: Check the format of embeddings
297 |     if embeddings and len(embeddings) > 0:
298 |         first_emb = embeddings[0]
299 |         logger.info(f"First embedding type: {type(first_emb)}")
300 |         logger.info(f"First embedding length: {len(first_emb)}")
301 |         if hasattr(first_emb, 'tolist') and callable(getattr(first_emb, 'tolist')):
302 |             logger.info("Converting embeddings from numpy/tensor to list format")
303 |             embeddings = [emb.tolist() for emb in embeddings]
304 |     
305 |     return embeddings
306 | 
307 | def store_in_database(texts, embeddings, metadata, collection_name="science_9_collection"):
308 |     """
309 |     Store documents and embeddings in Qdrant.
310 |     
311 |     Args:
312 |         texts: List of text chunks
313 |         embeddings: List of embedding vectors
314 |         metadata: List of metadata dictionaries
315 |         collection_name: Name of the Qdrant collection
316 |     """
317 |     logger.info(f"Storing data in collection: {collection_name}")
318 | 
319 |     # Always use 2560 for QWEN-4B size
320 |     vector_size = 2560
321 |     logger.info(f"Using vector size: {vector_size}")
322 |     
323 |     # Create Qdrant connector
324 |     qdrant = QdrantConnector(
325 |         host=QDRANT_HOST,
326 |         port=QDRANT_PORT,
327 |         collection_name=collection_name,
328 |         vector_size=vector_size
329 |     )
330 |     
331 |     # Create collection if it doesn't exist
332 |     qdrant.recreate_collection()
333 |     
334 |     # Prepare documents for insertion
335 |     documents = []
336 |     for i, (text, meta) in enumerate(zip(texts, metadata)):
337 |         # Use numeric ID (required by Qdrant) but save original ID in metadata
338 |         original_id = meta.get("id", f"doc_{i}")
339 |         meta["original_id"] = original_id  # Keep the original ID in metadata
340 |         
341 |         doc = {
342 |             "id": i,  # Use simple numeric ID for Qdrant
343 |             "text": text,
344 |             "metadata": meta
345 |         }
346 |         documents.append(doc)
347 |     
348 |     # Insert documents
349 |     success = qdrant.insert_documents(documents, embeddings)
350 |     
351 |     if success:
352 |         logger.info(f"Successfully stored {len(documents)} documents in database")
353 |     else:
354 |         logger.error("Failed to store documents in database")
355 |         
356 |     return success
357 | 
358 | def main():
359 |     """Main processing function"""
360 |     logger.info("Starting improved data processing with NV-Embed")
361 |     
362 |     # Load improved data
363 |     improved_data = load_improved_data()
364 |     if not improved_data:
365 |         logger.error("No improved data found. Exiting.")
366 |         return
367 |     
368 |     # Prepare documents
369 |     texts, metadata = prepare_documents(improved_data)
370 |     if not texts:
371 |         logger.error("No text chunks extracted. Exiting.")
372 |         return
373 |     
374 |     # Generate embeddings
375 |     embeddings = generate_embeddings(texts)
376 |     if not embeddings or len(embeddings) != len(texts):
377 |         logger.error(f"Embedding generation failed. Got {len(embeddings)} embeddings for {len(texts)} texts.")
378 |         return
379 |     
380 |     # Store in database
381 |     success = store_in_database(texts, embeddings, metadata)
382 |     
383 |     if success:
384 |         logger.info(f"✅ Processing completed successfully!")
385 |         logger.info(f"   Processed {len(texts)} documents across {len(improved_data)} files")
386 |         logger.info(f"   Files processed: {[data['file'] for data in improved_data]}")
387 |     else:
388 |         logger.error("❌ Processing failed.")
389 | 
390 | if __name__ == "__main__":
391 |     main()


--------------------------------------------------------------------------------
/lesson_plan_generator.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """
  3 | Simplified MCQ Generator for EduPlan AI
  4 | Generates exactly 5 MCQs for any given topic and saves them in Output_Lesson_Plans
  5 | """
  6 | 
  7 | import os
  8 | import logging
  9 | import json
 10 | from typing import List, Dict, Any, Optional
 11 | from datetime import datetime
 12 | from pathlib import Path
 13 | from langchain_qdrant import Qdrant
 14 | from langchain_core.prompts import PromptTemplate
 15 | from langchain_core.output_parsers import PydanticOutputParser
 16 | from pydantic import BaseModel, Field
 17 | from langchain_openai import ChatOpenAI
 18 | from langchain_community.embeddings import HuggingFaceEmbeddings
 19 | from qdrant_client import QdrantClient, models
 20 | 
 21 | # Local imports - adjust these paths according to your project structure
 22 | from qdrant_connector import QdrantConnector
 23 | from config import QDRANT_HOST, QDRANT_PORT, QDRANT_COLLECTION_NAME, QDRANT_VECTOR_SIZE
 24 | 
 25 | logging.basicConfig(level=logging.INFO)
 26 | logger = logging.getLogger(__name__)
 27 | 
 28 | class MCQQuestion(BaseModel):  # Schema of a single multiple-choice question
 29 |     question: str  # Question text
 30 |     option_a: str  # Text of option A
 31 |     option_b: str  # Text of option B
 32 |     option_c: str  # Text of option C
 33 |     option_d: str  # Text of option D
 34 |     correct_answer: str  # NOTE(review): presumably the option letter (A-D) - confirm
 35 |     explanation: str  # Explanation for the correct answer
 36 | 
 37 | class MCQSet(BaseModel):  # Top-level schema given to PydanticOutputParser in MCQGenerator
 38 |     mcqs: List[MCQQuestion]  # Generated questions; the prompt asks for exactly 5
 39 | 
 40 | class MCQGenerator:
 41 |     def __init__(self, openai_api_key: Optional[str] = None):
 42 |         self.openai_api_key = openai_api_key or os.getenv("OPENAI_API_KEY")
 43 |         if not self.openai_api_key:
 44 |             raise ValueError("❌ OpenAI API key not found. Set OPENAI_API_KEY environment variable.")
 45 | 
 46 |         self.qdrant_connector = QdrantConnector(
 47 |             host=QDRANT_HOST,
 48 |             port=QDRANT_PORT,
 49 |             collection_name=QDRANT_COLLECTION_NAME,
 50 |             vector_size=QDRANT_VECTOR_SIZE
 51 |         )
 52 |         self.embeddings = HuggingFaceEmbeddings(
 53 |             model_name="Qwen/Qwen3-Embedding-4B",
 54 |             model_kwargs={'device': 'cpu'},
 55 |             encode_kwargs={'normalize_embeddings': True}
 56 |         )
 57 |         self.vector_store = self._initialize_vector_store()
 58 |         self.llm = ChatOpenAI(
 59 |             model="gpt-4o",  # <-- FIXED MODEL NAME
 60 |             temperature=0.3,
 61 |             openai_api_key=self.openai_api_key
 62 |         )
 63 |         self.output_parser = PydanticOutputParser(pydantic_object=MCQSet)
 64 |         logger.info("✅ MCQ Generator initialized successfully")
 65 | 
 66 |     def _initialize_vector_store(self):
 67 |         try:
 68 |             vector_store = Qdrant(
 69 |                 client=self.qdrant_connector.client,
 70 |                 collection_name=QDRANT_COLLECTION_NAME,
 71 |                 embeddings=self.embeddings
 72 |             )
 73 |             logger.info("✅ Connected to Qdrant vector store")
 74 |             return vector_store
 75 |         except Exception as e:
 76 |             logger.error(f"❌ Failed to initialize vector store: {e}")
 77 |             return None
 78 | 
 79 |     def retrieve_relevant_content(self, topic: str, chapter: str = None, section: str = None, 
 80 |                                 content_type: str = None, top_k: int = 8) -> List[Any]:
 81 |         search_query = topic
 82 |         logger.info(f"🔍 Searching for topic: '{search_query}'")
 83 |         query_embedding = self.embeddings.embed_query(search_query)
 84 |         must_conditions = []
 85 |         if chapter:
 86 |             must_conditions.append(models.FieldCondition(key="chapter_number", match=models.MatchValue(value=str(chapter))))
 87 |         if section:
 88 |             must_conditions.append(models.FieldCondition(key="section_number", match=models.MatchValue(value=str(section))))
 89 |         if content_type:
 90 |             must_conditions.append(models.FieldCondition(key="content_type", match=models.MatchValue(value=content_type)))
 91 |         query_filter = models.Filter(must=must_conditions) if must_conditions else None
 92 | 
 93 |         try:
 94 |             client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)
 95 |             if query_filter:
 96 |                 logger.info(f"🎯 Applying filters: {must_conditions}")
 97 |                 results = client.query_points(
 98 |                     collection_name=QDRANT_COLLECTION_NAME,
 99 |                     query=query_embedding,
100 |                     query_filter=query_filter,
101 |                     limit=top_k,
102 |                     search_params=models.SearchParams(hnsw_ef=128, exact=False)
103 |                 )
104 |             else:
105 |                 logger.info("🌐 Searching all documents without filters")
106 |                 results = client.query_points(
107 |                     collection_name=QDRANT_COLLECTION_NAME,
108 |                     query=query_embedding,
109 |                     limit=top_k,
110 |                     search_params=models.SearchParams(hnsw_ef=128, exact=False)
111 |                 )
112 |             logger.info(f"✅ Found {len(results.points)} relevant documents via Query API")
113 |             docs = []
114 |             for point in results.points:
115 |                 page_content = (point.payload.get("text", "") or 
116 |                               point.payload.get("content", "") or
117 |                               point.payload.get("chunk_text", "") or
118 |                               str(point.payload))
119 |                 metadata = point.payload.copy()
120 |                 doc = type('Doc', (), {})()
121 |                 doc.page_content = page_content
122 |                 doc.metadata = metadata
123 |                 doc.score = getattr(point, 'score', None)
124 |                 docs.append(doc)
125 |             if docs:
126 |                 logger.info(f"📄 Sample doc metadata: {docs[0].metadata}")
127 |                 logger.info(f"📝 Sample content length: {len(docs[0].page_content)} chars")
128 |             return docs
129 |         except Exception as e:
130 |             logger.error(f"❌ Error retrieving content via Query API: {e}")
131 |             logger.error(f"🔍 Query details - Collection: {QDRANT_COLLECTION_NAME}, Filter: {query_filter}")
132 |             return []
133 | 
134 |     def generate_mcqs(self, topic: str, chapter: str = None, section: str = None, 
135 |                      content_type: str = None) -> Dict[str, Any]:
136 |         relevant_docs = self.retrieve_relevant_content(topic, chapter, section, content_type)
137 |         if not relevant_docs:
138 |             logger.warning("⚠️ No documents found. Trying without filters...")
139 |             relevant_docs = self.retrieve_relevant_content(topic)
140 |         if not relevant_docs:
141 |             return {
142 |                 "error": "No relevant content found in the knowledge base",
143 |                 "topic": topic,
144 |                 "suggestions": [
145 |                     "Check if Qdrant contains data",
146 |                     "Try a different topic",
147 |                     "Check if embedding model matches the one used for indexing"
148 |                 ]
149 |             }
150 |         context = self._extract_context_from_docs(relevant_docs)
151 |         try:
152 |             mcq_set = self._generate_mcqs_with_openai(topic, context)
153 |             return {
154 |                 "mcqs": [
155 |                     {
156 |                         "question": mcq.question,
157 |                         "options": {
158 |                             "A": mcq.option_a,
159 |                             "B": mcq.option_b,
160 |                             "C": mcq.option_c,
161 |                             "D": mcq.option_d
162 |                         },
163 |                         "correct_answer": mcq.correct_answer,
164 |                         "explanation": mcq.explanation
165 |                     }
166 |                     for mcq in mcq_set.mcqs
167 |                 ],
168 |                 "topic": topic,
169 |                 "chapter": chapter,
170 |                 "section": section,
171 |                 "generated_at": datetime.now().isoformat(),
172 |                 "source_count": len(relevant_docs),
173 |                 "sources_preview": [
174 |                     {
175 |                         "chapter": doc.metadata.get('chapter_title', 'Unknown'),
176 |                         "section": doc.metadata.get('section_title', 'Unknown'),
177 |                         "content_type": doc.metadata.get('content_type', 'Unknown'),
178 |                         "preview": doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content
179 |                     }
180 |                     for doc in relevant_docs[:3]
181 |                 ]
182 |             }
183 |         except Exception as e:
184 |             logger.error(f"❌ Error generating MCQs: {e}")
185 |             return {
186 |                 "error": f"Failed to generate MCQs: {str(e)}",
187 |                 "topic": topic,
188 |                 "context_found": len(relevant_docs) > 0,
189 |                 "context_preview": relevant_docs[0].page_content[:200] if relevant_docs else "No context"
190 |             }
191 | 
192 |     def _extract_context_from_docs(self, docs: List[Any]) -> str:
193 |         context_parts = []
194 |         for i, doc in enumerate(docs):
195 |             content = doc.page_content.strip()
196 |             metadata = doc.metadata
197 |             context_part = f"""
198 | Document {i+1}:
199 | Chapter: {metadata.get('chapter_title', 'Unknown')}
200 | Section: {metadata.get('section_title', 'Unknown')}
201 | Content: {content}
202 | """
203 |             context_parts.append(context_part)
204 |         return "\n".join(context_parts)
205 | 
206 |     def _generate_mcqs_with_openai(self, topic: str, context: str) -> MCQSet:
207 |         prompt_template = """
208 | You are an expert educator and assessment designer. Create exactly 5 multiple choice questions (MCQs) based on the provided curriculum content.
209 | 
210 | CURRICULUM CONTENT:
211 | {context}
212 | 
213 | REQUIREMENTS:
214 | - Topic: {topic}
215 | - Generate exactly 5 MCQs based on the provided content
216 | - Each question should have 4 options (A, B, C, D)
217 | - Questions should test different levels of understanding (knowledge, comprehension, application, analysis)
218 | - Questions should be clear, unambiguous, and educational
219 | - Provide brief explanations for correct answers
220 | - Ensure questions are directly based on the provided curriculum content
221 | - Avoid trick questions or overly complex language
222 | 
223 | QUESTION DIFFICULTY LEVELS:
224 | 1. Basic knowledge/recall
225 | 2. Comprehension/understanding  
226 | 3. Application of concepts
227 | 4. Analysis/evaluation
228 | 5. Synthesis/problem-solving
229 | 
230 | {format_instructions}
231 | """
232 |         prompt = PromptTemplate(
233 |             template=prompt_template,
234 |             input_variables=["context", "topic"],
235 |             partial_variables={"format_instructions": self.output_parser.get_format_instructions()}
236 |         )
237 |         chain = prompt | self.llm | self.output_parser
238 |         result = chain.invoke({
239 |             "context": context,
240 |             "topic": topic
241 |         })
242 |         return result
243 | 
244 |     def save_mcqs(self, mcqs_data: Dict[str, Any], output_dir: str = "Output_Lesson_Plans"):
245 |         safe_topic = "_".join(mcqs_data['topic'].lower().split())
246 |         out_path = Path(output_dir) / f"MCQs_{safe_topic}.json"
247 |         with open(out_path, "w", encoding="utf-8") as f:
248 |             json.dump(mcqs_data, f, indent=2, ensure_ascii=False)
249 |         logger.info(f"Saved MCQs to {out_path}")
250 | 
251 |     def format_mcqs_for_display(self, mcqs_data: Dict[str, Any]) -> str:
252 |         if "error" in mcqs_data:
253 |             error_output = f"❌ Error: {mcqs_data['error']}\n"
254 |             if "context_preview" in mcqs_data:
255 |                 error_output += f"📄 Context found: {mcqs_data.get('context_found', False)}\n"
256 |                 error_output += f"🔍 Preview: {mcqs_data['context_preview']}\n"
257 |             return error_output
258 |         output = f"\n# MCQs: {mcqs_data['topic']}\n"
259 |         if mcqs_data.get('chapter'):
260 |             output += f"**Chapter:** {mcqs_data['chapter']} | "
261 |         if mcqs_data.get('section'):
262 |             output += f"**Section:** {mcqs_data['section']} | "
263 |         output += f"""**Generated:** {mcqs_data['generated_at'][:19]} | **Sources:** {mcqs_data['source_count']} documents\n\n## 📚 Source Materials Used:\n"""
264 |         for i, source in enumerate(mcqs_data.get('sources_preview', []), 1):
265 |             output += f"{i}. **{source['chapter']}** - {source['section']} ({source['content_type']})\n"
266 |             output += f"   Preview: {source['preview']}\n\n"
267 |         output += "---\n"
268 |         for i, mcq in enumerate(mcqs_data['mcqs'], 1):
269 |             output += f"""
270 | **Question {i}:** {mcq['question']}
271 | 
272 | A) {mcq['options']['A']}
273 | B) {mcq['options']['B']}  
274 | C) {mcq['options']['C']}
275 | D) {mcq['options']['D']}
276 | 
277 | **Answer:** {mcq['correct_answer']}
278 | **Explanation:** {mcq['explanation']}
279 | 
280 | ---
281 | """
282 |         return output.strip()
283 | 
def main():
    """Command-line entry point: generate MCQs for a topic, save and print them.

    Environment variables are loaded from a ``.env`` file next to this script
    when it exists, otherwise from python-dotenv's default lookup. The
    ``--topic`` argument is required; ``--chapter``, ``--section`` and
    ``--content_type`` are optional and passed through to
    ``MCQGenerator.generate_mcqs``.

    The formatted result is always printed. The JSON file is written only for
    successful results, so a failed run does not overwrite a previously saved
    ``MCQs_<topic>.json`` with an error payload.
    """
    try:
        from dotenv import load_dotenv
    except ImportError:
        print("⚠️ python-dotenv not installed. Environment variables may not be loaded from .env.")
    else:
        dotenv_path = Path(__file__).parent / '.env'
        if dotenv_path.exists():
            load_dotenv(dotenv_path)
            print(f"🔑 Loaded environment variables from {dotenv_path}")
        else:
            load_dotenv()
            print("🔑 Loaded environment variables from default .env location")

    import argparse
    parser = argparse.ArgumentParser(description="Generate 5 MCQs for a topic using OpenAI and save to Output_Lesson_Plans.")
    parser.add_argument("--topic", type=str, required=True, help="Topic for MCQ generation")
    parser.add_argument("--chapter", type=str, help="Chapter number (optional)")
    parser.add_argument("--section", type=str, help="Section number (optional)")
    parser.add_argument("--content_type", type=str, help="Content type filter (optional)")
    args = parser.parse_args()

    generator = MCQGenerator()
    result = generator.generate_mcqs(
        topic=args.topic,
        chapter=args.chapter,
        section=args.section,
        content_type=args.content_type
    )

    # Error results carry an "error" key (the same marker the formatter
    # checks); persisting them would clobber earlier good output.
    if "error" not in result:
        generator.save_mcqs(result)
    print(generator.format_mcqs_for_display(result))
313 | 
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
316 | # #!/usr/bin/env python3
317 | # """
318 | # LangChain-based Lesson Plan Generator for EduPlan AI
319 | # Uses RAG (Retrieval-Augmented Generation) with Qdrant vector database
320 | # """
321 | 
322 | # import os
323 | # import logging
324 | # from typing import List, Dict, Any, Optional
325 | # from datetime import datetime
326 | # from langchain_qdrant import Qdrant
327 | # # LangChain imports
328 | # from langchain_core.prompts import PromptTemplate
329 | # from langchain_core.output_parsers import PydanticOutputParser
330 | # from pydantic import BaseModel, Field
331 | # from langchain_openai import ChatOpenAI
332 | # from langchain_community.embeddings import HuggingFaceEmbeddings
333 | # from langchain.chains import RetrievalQA
334 | # from langchain.chains.question_answering import load_qa_chain
335 | 
336 | # # Local imports
337 | # from ..database.qdrant_connector import QdrantConnector
338 | # from ..config import QDRANT_HOST, QDRANT_PORT, QDRANT_COLLECTION_NAME, QDRANT_VECTOR_SIZE
339 | 
340 | # # Configure logging
341 | # logging.basicConfig(level=logging.INFO)
342 | # logger = logging.getLogger(__name__)
343 | 
344 | 
345 | # class LessonPlanStructure(BaseModel):
346 | #     """Structured lesson plan output"""
347 | #     title: str = Field(description="Lesson title")
348 | #     subject: str = Field(description="Subject area")
349 | #     grade_level: str = Field(description="Target grade level")
350 | #     duration: str = Field(description="Lesson duration")
351 | #     objectives: List[str] = Field(description="Learning objectives")
352 | #     materials: List[str] = Field(description="Required materials")
353 | #     introduction: str = Field(description="Introduction activity")
354 | #     main_activities: List[str] = Field(description="Main teaching activities")
355 | #     assessment: str = Field(description="Assessment strategy")
356 | #     differentiation: str = Field(description="Differentiation strategies")
357 | #     standards: List[str] = Field(description="Curriculum standards alignment")
358 | 
359 | 
360 | # class LangChainLessonGenerator:
361 | #     """LangChain-powered lesson plan generator using Qdrant embeddings"""
362 | 
363 | #     def __init__(self, openai_api_key: Optional[str] = None):
364 | #         """Initialize the lesson generator with LangChain components"""
365 | 
366 | #         # Initialize OpenAI API key
367 | #         self.openai_api_key = openai_api_key or os.getenv("OPENAI_API_KEY")
368 | #         if not self.openai_api_key:
369 | #             logger.warning("⚠️ OpenAI API key not found. Set OPENAI_API_KEY environment variable.")
370 | 
371 | #         # Initialize Qdrant connection
372 | #         self.qdrant_connector = QdrantConnector(
373 | #             host=QDRANT_HOST,
374 | #             port=QDRANT_PORT,
375 | #             collection_name=QDRANT_COLLECTION_NAME,
376 | #             vector_size=QDRANT_VECTOR_SIZE
377 | #         )
378 |         
379 | 
380 | #         # Initialize embeddings (using same model as your embeddings)
381 | #         self.embeddings = HuggingFaceEmbeddings(
382 | #             model_name="Qwen/Qwen3-Embedding-4B",
383 | #             model_kwargs={'device': 'cpu'},  # Use CPU for retrieval
384 | #             encode_kwargs={'normalize_embeddings': True}
385 | #         )
386 | 
387 | #         # Initialize LangChain Qdrant vector store
388 | #         self.vector_store = self._initialize_vector_store()
389 | 
390 |         
391 | #         # Initialize LLM
392 | #         self.llm = ChatOpenAI(
393 | #             model="gpt-4o-mini",  # Cost-effective model
394 | #             temperature=0.3,
395 | #             openai_api_key=self.openai_api_key
396 | #         ) if self.openai_api_key else None
397 | 
398 | #         # Initialize output parser
399 | #         self.output_parser = PydanticOutputParser(pydantic_object=LessonPlanStructure)
400 | 
401 | #         logger.info("✅ LangChain Lesson Generator initialized")
402 | 
403 | #     def _initialize_vector_store(self):
404 | #         """Initialize LangChain Qdrant vector store"""
405 | #         try:
406 | #             vector_store = Qdrant(
407 | #                 client=self.qdrant_connector.client,
408 | #                 collection_name=QDRANT_COLLECTION_NAME,
409 | #                 embeddings=self.embeddings
410 | #             )
411 | #             logger.info("✅ Connected to Qdrant vector store")
412 | #             return vector_store
413 | #         except Exception as e:
414 | #             logger.error(f"❌ Failed to initialize vector store: {e}")
415 | #             return None
416 | 
417 | #     def retrieve_relevant_content(self, topic: str, grade_level: str = None, subject: str = None, top_k: int = 8) -> List[Any]:
418 | #         """Retrieve relevant educational content from Qdrant"""
419 | 
420 | #         # Build search query
421 | #         search_query = f"{topic}"
422 | #         if grade_level:
423 | #             search_query += f" grade {grade_level}"
424 | #         if subject:
425 | #             search_query += f" {subject}"
426 | 
427 | #         logger.info(f"🔍 Searching for: '{search_query}'")
428 | 
429 | #         try:
430 | #             # Perform similarity search
431 | #             docs = self.vector_store.similarity_search(
432 | #                 query=search_query,
433 | #                 k=top_k,
434 | #                 filter=None
435 | #             )
436 | 
437 | #             logger.info(f"✅ Found {len(docs)} relevant documents")
438 | #             return docs
439 | 
440 | #         except Exception as e:
441 | #             logger.error(f"❌ Error retrieving content: {e}")
442 | #             return []
443 | #         def retrieve_relevant_content(self, topic: str, grade_level: str = None, subject: str = None, chapter: str = None, section: str = None, top_k: int = 8) -> List[Any]:
444 | #             """Retrieve relevant educational content from Qdrant using Query API"""
445 | 
446 | #             from qdrant_client import QdrantClient, models
447 | #             from langchain_community.embeddings import HuggingFaceEmbeddings
448 | 
449 | #             # Build search query
450 | #             search_query = f"{topic}"
451 | #             if grade_level:
452 | #                 search_query += f" grade {grade_level}"
453 | #             if subject:
454 | #                 search_query += f" {subject}"
455 | #             if chapter:
456 | #                 search_query += f" chapter {chapter}"
457 | #             if section:
458 | #                 search_query += f" section {section}"
459 | 
460 | #             logger.info(f"🔍 Searching for: '{search_query}'")
461 | 
462 | #             # Get embedding for query
463 | #             query_embedding = self.embeddings.embed_query(search_query)
464 | 
465 | #             # Build advanced filter
466 | #             must_conditions = []
467 | #             if grade_level:
468 | #                 must_conditions.append(models.FieldCondition(key="grade_level", match=models.MatchValue(value=grade_level)))
469 | #             if subject:
470 | #                 must_conditions.append(models.FieldCondition(key="subject", match=models.MatchValue(value=subject)))
471 | #             if chapter:
472 | #                 must_conditions.append(models.FieldCondition(key="chapter_number", match=models.MatchValue(value=chapter)))
473 | #             if section:
474 | #                 must_conditions.append(models.FieldCondition(key="section_number", match=models.MatchValue(value=section)))
475 | 
476 | #             query_filter = models.Filter(must=must_conditions) if must_conditions else None
477 | 
478 | #             # Use QdrantClient Query API
479 | #             try:
480 | #                 client = QdrantClient(host=QDRANT_HOST, port=QDRANT_PORT)
481 | #                 results = client.query_points(
482 | #                     collection_name=QDRANT_COLLECTION_NAME,
483 | #                     query=query_embedding,
484 | #                     query_filter=query_filter,
485 | #                     limit=top_k,
486 | #                     search_params=models.SearchParams(hnsw_ef=128, exact=False)
487 | #                 )
488 | #                 logger.info(f"✅ Found {len(results)} relevant documents via Query API")
489 | #                 # Convert results to LangChain Document-like objects
490 | #                 docs = []
491 | #                 for point in results:
492 | #                     page_content = point.payload.get("text", "")
493 | #                     metadata = point.payload.copy()
494 | #                     doc = type('Doc', (), {})()
495 | #                     doc.page_content = page_content
496 | #                     doc.metadata = metadata
497 | #                     doc.score = getattr(point, 'score', None)
498 | #                     docs.append(doc)
499 | #                 return docs
500 | #             except Exception as e:
501 | #                 logger.error(f"❌ Error retrieving content via Query API: {e}")
502 | #                 return []
503 | 
504 | #     def _build_filter(self, grade_level: str = None, subject: str = None) -> Optional[Dict[str, Any]]:
505 | #         """Build Qdrant filter for search"""
506 | #         filter_conditions = []
507 | 
508 | #         if grade_level:
509 | #             filter_conditions.append({
510 | #                 "key": "metadata.grade_level",
511 | #                 "match": {"value": grade_level}
512 | #             })
513 | 
514 | #         if subject:
515 | #             filter_conditions.append({
516 | #                 "key": "metadata.subject",
517 | #                 "match": {"value": subject}
518 | #             })
519 | 
520 | #         if filter_conditions:
521 | #             return {"must": filter_conditions}
522 | #         return None
523 | #         # _build_filter is now handled by Query API above
524 | 
525 | #     def generate_lesson_plan(self, topic: str, grade_level: str = "9-12", subject: str = "General",
526 | #                            duration: str = "45 minutes", custom_requirements: str = "", chapter: str = None, section: str = None) -> Dict[str, Any]:
527 | #         """
528 | #         Generate a comprehensive lesson plan using RAG
529 | 
530 | #         Args:
531 | #             topic: Main lesson topic
532 | #             grade_level: Target grade level
533 | #             subject: Subject area
534 | #             duration: Lesson duration
535 | #             custom_requirements: Additional requirements or constraints
536 | #             chapter: Optional chapter number for filtering
537 | #             section: Optional section number for filtering
538 | #         """
539 | #         # Retrieve relevant content
540 | #         relevant_docs = self.retrieve_relevant_content(topic, grade_level, subject, chapter, section)
541 | 
542 | #         if not relevant_docs:
543 | #             return {
544 | #                 "error": "No relevant content found in the knowledge base",
545 | #                 "topic": topic,
546 | #                 "suggestions": ["Try a different topic", "Check grade level", "Verify subject area"]
547 | #             }
548 | 
549 | #         # Extract context from retrieved documents
550 | #         context = self._extract_context_from_docs(relevant_docs)
551 | 
552 | #         # Generate lesson plan using LangChain
553 | #         if self.llm:
554 | #             lesson_plan = self._generate_with_langchain(topic, context, grade_level, subject, duration, custom_requirements)
555 | #         else:
556 | #             lesson_plan = self._generate_fallback_lesson_plan(topic, context, grade_level, subject, duration)
557 | 
558 | #         return {
559 | #             "lesson_plan": lesson_plan,
560 | #             "sources": [
561 | #                 {
562 | #                     "content": doc.page_content[:200] + "...",
563 | #                     "metadata": doc.metadata,
564 | #                     "score": getattr(doc, 'score', None)
565 | #                 } for doc in relevant_docs
566 | #             ],
567 | #             "topic": topic,
568 | #             "grade_level": grade_level,
569 | #             "subject": subject,
570 | #             "generated_at": datetime.now().isoformat(),
571 | #             "source_count": len(relevant_docs)
572 | #         }
573 | 
574 | #         if not relevant_docs:
575 | #             return {
576 | #                 "error": "No relevant content found in the knowledge base",
577 | #                 "topic": topic,
578 | #                 "suggestions": ["Try a different topic", "Check grade level", "Verify subject area"]
579 | #             }
580 | 
581 | #         # Extract context from retrieved documents
582 | #         context = self._extract_context_from_docs(relevant_docs)
583 | 
584 | #         # Generate lesson plan using LangChain
585 | #         if self.llm:
586 | #             lesson_plan = self._generate_with_langchain(topic, context, grade_level, subject, duration, custom_requirements)
587 | #         else:
588 | #             lesson_plan = self._generate_fallback_lesson_plan(topic, context, grade_level, subject, duration)
589 | 
590 | #         return {
591 | #             "lesson_plan": lesson_plan,
592 | #             "sources": [
593 | #                 {
594 | #                     "content": doc.page_content[:200] + "...",
595 | #                     "metadata": doc.metadata,
596 | #                     "score": getattr(doc, 'score', None)
597 | #                 } for doc in relevant_docs
598 | #             ],
599 | #             "topic": topic,
600 | #             "grade_level": grade_level,
601 | #             "subject": subject,
602 | #             "generated_at": datetime.now().isoformat(),
603 | #             "source_count": len(relevant_docs)
604 | #         }
605 | 
606 | #     def _extract_context_from_docs(self, docs: List[Any]) -> str:
607 | #         """Extract and format context from retrieved documents"""
608 | #         context_parts = []
609 | 
610 | #         for i, doc in enumerate(docs):
611 | #             content = doc.page_content.strip()
612 | #             metadata = doc.metadata
613 | 
614 | #             # Format context with metadata
615 | #             context_part = f"""
616 | # Document {i+1}:
617 | # Chapter: {metadata.get('chapter_title', 'Unknown')}
618 | # Section: {metadata.get('section_title', 'Unknown')}
619 | # Content: {content}
620 | # """
621 | #             context_parts.append(context_part)
622 | 
623 | #         return "\n".join(context_parts)
624 | 
625 | #     def _generate_with_langchain(self, topic: str, context: str, grade_level: str,
626 | #                                subject: str, duration: str, custom_requirements: str) -> str:
627 | #         """Generate lesson plan using LangChain and LLM"""
628 | 
629 | #         # Create prompt template
630 | #         prompt_template = """
631 | # You are an expert educational curriculum designer. Create a comprehensive, standards-aligned lesson plan using the provided context.
632 | 
633 | # CONTEXT FROM CURRICULUM:
634 | # {context}
635 | 
636 | # LESSON REQUIREMENTS:
637 | # - Topic: {topic}
638 | # - Grade Level: {grade_level}
639 | # - Subject: {subject}
640 | # - Duration: {duration}
641 | # - Additional Requirements: {custom_requirements}
642 | 
643 | # Generate a detailed lesson plan that includes:
644 | # 1. Clear learning objectives
645 | # 2. Engaging introduction activity
646 | # 3. Main teaching activities with timing
647 | # 4. Hands-on practice activities
648 | # 5. Assessment strategies
649 | # 6. Differentiation for diverse learners
650 | # 7. Required materials
651 | # 8. Curriculum standards alignment
652 | # 9. Extension activities
653 | 
654 | # Format the lesson plan professionally with clear sections and actionable details.
655 | # Ensure the plan is age-appropriate for {grade_level} students and aligns with {subject} curriculum standards.
656 | 
657 | # {format_instructions}
658 | # """
659 | 
660 | #         prompt = PromptTemplate(
661 | #             template=prompt_template,
662 | #             input_variables=["context", "topic", "grade_level", "subject", "duration", "custom_requirements"],
663 | #             partial_variables={"format_instructions": self.output_parser.get_format_instructions()}
664 | #         )
665 | 
666 | #         try:
667 | #             # Create chain
668 | #             chain = prompt | self.llm | self.output_parser
669 | 
670 | #             # Generate lesson plan
671 | #             result = chain.invoke({
672 | #                 "context": context,
673 | #                 "topic": topic,
674 | #                 "grade_level": grade_level,
675 | #                 "subject": subject,
676 | #                 "duration": duration,
677 | #                 "custom_requirements": custom_requirements
678 | #             })
679 | 
680 | #             # Format as readable lesson plan
681 | #             return self._format_lesson_plan_output(result)
682 | 
683 | #         except Exception as e:
684 | #             logger.error(f"❌ Error generating lesson plan with LangChain: {e}")
685 | #             return self._generate_fallback_lesson_plan(topic, context, grade_level, subject, duration)
686 | 
687 | #     def _format_lesson_plan_output(self, structured_plan: LessonPlanStructure) -> str:
688 | #         """Format structured lesson plan into readable text"""
689 | 
690 | #         lesson_plan = f"""
691 | # # {structured_plan.title}
692 | 
693 | # ## 📚 Course Information
694 | # - **Subject:** {structured_plan.subject}
695 | # - **Grade Level:** {structured_plan.grade_level}
696 | # - **Duration:** {structured_plan.duration}
697 | 
698 | # ## 🎯 Learning Objectives
699 | # {chr(10).join(f"{i+1}. {obj}" for i, obj in enumerate(structured_plan.objectives))}
700 | 
701 | # ## 📋 Materials Required
702 | # {chr(10).join(f"- {material}" for material in structured_plan.materials)}
703 | 
704 | # ## 🚀 Introduction Activity ({structured_plan.duration})
705 | # {structured_plan.introduction}
706 | 
707 | # ## 📖 Main Teaching Activities
708 | # {chr(10).join(f"### Activity {i+1}{chr(10)}{activity}" for i, activity in enumerate(structured_plan.main_activities))}
709 | 
710 | # ## ✅ Assessment Strategy
711 | # {structured_plan.assessment}
712 | 
713 | # ## 🎭 Differentiation Strategies
714 | # {structured_plan.differentiation}
715 | 
716 | # ## 📏 Curriculum Standards Alignment
717 | # {chr(10).join(f"- {standard}" for standard in structured_plan.standards)}
718 | 
719 | # ## 🔄 Extension Activities
720 | # - Advanced practice problems
721 | # - Research projects on related topics
722 | # - Real-world application assignments
723 | # - Peer teaching opportunities
724 | 
725 | # ---
726 | # *Generated by EduPlan AI - LangChain RAG System*
727 | # *Based on curriculum embeddings from Qdrant vector database*
728 | # """
729 | 
730 | #         return lesson_plan.strip()
731 | 
732 | #     def _generate_fallback_lesson_plan(self, topic: str, context: str, grade_level: str,
733 | #                                      subject: str, duration: str) -> str:
734 | #         """Fallback lesson plan generation without LLM"""
735 | 
736 | #         lesson_plan = f"""
737 | # # Lesson Plan: {topic}
738 | 
739 | # ## 📚 Course Information
740 | # - **Subject:** {subject}
741 | # - **Grade Level:** {grade_level}
742 | # - **Duration:** {duration}
743 | 
744 | # ## 🎯 Learning Objectives
745 | # 1. Understand the fundamental concepts of {topic}
746 | # 2. Apply learned principles to solve related problems
747 | # 3. Demonstrate comprehension through structured activities
748 | # 4. Connect new knowledge to existing curriculum
749 | 
750 | # ## 📋 Materials Required
751 | # - Textbook and reference materials
752 | # - Whiteboard/markers or presentation tools
753 | # - Worksheets and practice exercises
754 | # - Assessment materials
755 | 
756 | # ## 🚀 Introduction Activity (10 minutes)
757 | # - Engage students with a relevant real-world example
758 | # - Connect to previous knowledge
759 | # - Present learning objectives
760 | # - Set expectations for the lesson
761 | 
762 | # ## 📖 Main Teaching Activities
763 | 
764 | # ### Direct Instruction (15 minutes)
765 | # - Present core concepts with clear examples
766 | # - Use visual aids and demonstrations
767 | # - Encourage student questions and participation
768 | 
769 | # ### Guided Practice (15 minutes)
770 | # - Work through examples together as a class
771 | # - Provide step-by-step guidance
772 | # - Address common misconceptions
773 | 
774 | # ### Independent Practice (10 minutes)
775 | # - Students work individually on practice problems
776 | # - Circulate to provide individual support
777 | # - Monitor understanding and progress
778 | 
779 | # ## ✅ Assessment Strategy
780 | # - Formative assessment through observation and questioning
781 | # - Exit ticket with key concept check
782 | # - Homework assignment for reinforcement
783 | 
784 | # ## 🎭 Differentiation Strategies
785 | # - Provide additional support for struggling students
786 | # - Offer extension activities for advanced learners
787 | # - Use flexible grouping based on readiness
788 | 
789 | # ## 📏 Curriculum Standards Alignment
790 | # - Aligned with {subject} curriculum standards
791 | # - Meets grade {grade_level} learning expectations
792 | # - Supports progression to next concepts
793 | 
794 | # ---
795 | # *Generated by EduPlan AI - Fallback Mode*
796 | # *Note: For enhanced lesson plans, configure OpenAI API key*
797 | # """
798 | 
799 | #         return lesson_plan.strip()
800 | 
801 | #     def search_similar_topics(self, topic: str, top_k: int = 5) -> List[Dict[str, Any]]:
802 | #         """Search for similar topics in the knowledge base"""
803 | #         try:
804 | #             docs = self.vector_store.similarity_search(topic, k=top_k)
805 | #             return [
806 | #                 {
807 | #                     "topic": doc.metadata.get("section_title", "Unknown"),
808 | #                     "chapter": doc.metadata.get("chapter_title", "Unknown"),
809 | #                     "content_preview": doc.page_content[:150] + "...",
810 | #                     "similarity_score": getattr(doc, 'score', None)
811 | #                 } for doc in docs
812 | #             ]
813 | #         except Exception as e:
814 | #             logger.error(f"❌ Error searching similar topics: {e}")
815 | #             return []
816 | 
817 | 
818 | # def main():
819 | #     """Example usage of the LangChain lesson generator"""
820 | 
821 | #     # Initialize generator
822 | #     generator = LangChainLessonGenerator()
823 | 
824 | #     # Example lesson generation
825 | #     topic = "Introduction to Atomic Theory"
826 | #     grade_level = "9-10"
827 | #     subject = "Chemistry"
828 | 
829 | #     print(f"🎓 Generating lesson plan for: {topic}")
830 | #     print(f"📚 Grade Level: {grade_level} | Subject: {subject}")
831 | #     print("-" * 60)
832 | 
833 | #     result = generator.generate_lesson_plan(
834 | #         topic=topic,
835 | #         grade_level=grade_level,
836 | #         subject=subject,
837 | #         duration="50 minutes"
838 | #     )
839 | 
840 | #     if "error" in result:
841 | #         print(f"❌ Error: {result['error']}")
842 | #         print("💡 Suggestions:")
843 | #         for suggestion in result.get("suggestions", []):
844 | #             print(f"   - {suggestion}")
845 | #     else:
846 | #         print(result["lesson_plan"])
847 | #         print(f"\n📊 Sources used: {result['source_count']} documents")
848 | #         print(f"⏰ Generated at: {result['generated_at']}")
849 | 
850 | 
851 | # if __name__ == "__main__":
852 | #     main()
853 | 


--------------------------------------------------------------------------------