129 |
130 | `;
131 | }
132 |
133 | messageDiv.innerHTML = html;
134 | chatMessages.appendChild(messageDiv);
135 | chatMessages.scrollTop = chatMessages.scrollHeight;
136 |
137 | return messageId;
138 | }
139 |
140 | // Helper function to escape HTML for user messages
141 | function escapeHtml(text) {
142 | const div = document.createElement('div');
143 | div.textContent = text;
144 | return div.innerHTML;
145 | }
146 |
147 | // Removed removeMessage function - no longer needed since we handle loading differently
148 |
149 | async function createNewSession() {
150 | currentSessionId = null;
151 | chatMessages.innerHTML = '';
152 | addMessage('Welcome to the Course Materials Assistant! I can help you with questions about courses, lessons and specific content. What would you like to know?', 'assistant', null, true);
153 | }
154 |
155 | // Load course statistics
156 | async function loadCourseStats() {
157 | try {
158 | console.log('Loading course stats...');
159 | const response = await fetch(`${API_URL}/courses`);
160 | if (!response.ok) throw new Error('Failed to load course stats');
161 |
162 | const data = await response.json();
163 | console.log('Course data received:', data);
164 |
165 | // Update stats in UI
166 | if (totalCourses) {
167 | totalCourses.textContent = data.total_courses;
168 | }
169 |
170 | // Update course titles
171 | if (courseTitles) {
172 | if (data.course_titles && data.course_titles.length > 0) {
173 | courseTitles.innerHTML = data.course_titles
174 | .map(title => `<div class="course-title-item">${title}</div>`)
175 | .join('');
176 | } else {
177 | courseTitles.innerHTML = 'No courses available';
178 | }
179 | }
180 |
181 | } catch (error) {
182 | console.error('Error loading course stats:', error);
183 | // Set default values on error
184 | if (totalCourses) {
185 | totalCourses.textContent = '0';
186 | }
187 | if (courseTitles) {
188 | courseTitles.innerHTML = 'Failed to load courses';
189 | }
190 | }
191 | }
--------------------------------------------------------------------------------
/backend/document_processor.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | from typing import List, Tuple
4 | from models import Course, Lesson, CourseChunk
5 |
6 | class DocumentProcessor:
7 | """Processes course documents and extracts structured information"""
8 |
9 | def __init__(self, chunk_size: int, chunk_overlap: int):
10 | self.chunk_size = chunk_size
11 | self.chunk_overlap = chunk_overlap
12 |
13 | def read_file(self, file_path: str) -> str:
14 | """Read content from file with UTF-8 encoding"""
15 | try:
16 | with open(file_path, 'r', encoding='utf-8') as file:
17 | return file.read()
18 | except UnicodeDecodeError:
19 | # If UTF-8 fails, try with error handling
20 | with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
21 | return file.read()
22 |
23 |
24 |
25 | def chunk_text(self, text: str) -> List[str]:
26 | """Split text into sentence-based chunks with overlap using config settings"""
27 |
28 | # Clean up the text
29 | text = re.sub(r'\s+', ' ', text.strip()) # Normalize whitespace
30 |
31 | # Better sentence splitting that handles abbreviations
32 | # This regex looks for periods followed by whitespace and capital letters
33 | # but ignores common abbreviations
34 | sentence_endings = re.compile(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\!|\?)\s+(?=[A-Z])')
35 | sentences = sentence_endings.split(text)
36 |
37 | # Clean up whitespace and drop empty sentences
38 | sentences = [s.strip() for s in sentences if s.strip()]
39 |
40 | chunks = []
41 | i = 0  # Index of the first sentence of the current chunk
42 |
43 | while i < len(sentences):
44 | current_chunk = []
45 | current_size = 0
46 |
47 | # Build a chunk starting from sentence i
48 | for j in range(i, len(sentences)):
49 | sentence = sentences[j]
50 |
51 | # Account for the space that joins sentences within a chunk
52 | space_size = 1 if current_chunk else 0
53 | total_addition = len(sentence) + space_size
54 |
55 | # Stop if adding this sentence would exceed the configured chunk size
56 | if current_size + total_addition > self.chunk_size and current_chunk:
57 | break
58 |
59 | current_chunk.append(sentence)
60 | current_size += total_addition
61 |
62 | # Add chunk if we have content
63 | if current_chunk:
64 | chunks.append(' '.join(current_chunk))
65 |
66 | # Calculate overlap for next chunk
67 | if hasattr(self, 'chunk_overlap') and self.chunk_overlap > 0:
68 | # Find how many sentences to overlap
69 | overlap_size = 0
70 | overlap_sentences = 0
71 |
72 | # Count backwards from end of current chunk
73 | for k in range(len(current_chunk) - 1, -1, -1):
74 | sentence_len = len(current_chunk[k]) + (1 if k < len(current_chunk) - 1 else 0)
75 | if overlap_size + sentence_len <= self.chunk_overlap:
76 | overlap_size += sentence_len
77 | overlap_sentences += 1
78 | else:
79 | break
80 |
81 | # Move start position considering overlap
82 | next_start = i + len(current_chunk) - overlap_sentences
83 | i = max(next_start, i + 1) # Ensure we make progress
84 | else:
85 | # No overlap - move to next sentence after current chunk
86 | i += len(current_chunk)
87 | else:
88 | # No sentences fit, move to next
89 | i += 1
90 |
91 | return chunks
92 |
93 |
94 |
95 |
96 |
97 | def process_course_document(self, file_path: str) -> Tuple[Course, List[CourseChunk]]:
98 | """
99 | Process a course document with expected format:
100 | Line 1: Course Title: [title]
101 | Line 2: Course Link: [url]
102 | Line 3: Course Instructor: [instructor]
103 | Following lines: Lesson markers and content
104 | """
105 | content = self.read_file(file_path)
106 | filename = os.path.basename(file_path)
107 |
108 | lines = content.strip().split('\n')
109 |
110 | # Extract course metadata from first three lines
111 | course_title = filename # Default fallback
112 | course_link = None
113 | instructor_name = "Unknown"
114 |
115 | # Parse course title from first line
116 | if len(lines) >= 1 and lines[0].strip():
117 | title_match = re.match(r'^Course Title:\s*(.+)$', lines[0].strip(), re.IGNORECASE)
118 | if title_match:
119 | course_title = title_match.group(1).strip()
120 | else:
121 | course_title = lines[0].strip()
122 |
123 | # Parse remaining lines for course metadata
124 | for i in range(1, min(len(lines), 4)): # Check lines 2-4 for metadata
125 | line = lines[i].strip()
126 | if not line:
127 | continue
128 |
129 | # Try to match course link
130 | link_match = re.match(r'^Course Link:\s*(.+)$', line, re.IGNORECASE)
131 | if link_match:
132 | course_link = link_match.group(1).strip()
133 | continue
134 |
135 | # Try to match instructor
136 | instructor_match = re.match(r'^Course Instructor:\s*(.+)$', line, re.IGNORECASE)
137 | if instructor_match:
138 | instructor_name = instructor_match.group(1).strip()
139 | continue
140 |
141 | # Create course object with title as ID
142 | course = Course(
143 | title=course_title,
144 | course_link=course_link,
145 | instructor=instructor_name if instructor_name != "Unknown" else None
146 | )
147 |
148 | # Process lessons and create chunks
149 | course_chunks = []
150 | current_lesson = None
151 | lesson_title = None
152 | lesson_link = None
153 | lesson_content = []
154 | chunk_counter = 0
155 |
156 | # Start processing from line 4 (after metadata)
157 | start_index = 3
158 | if len(lines) > 3 and not lines[3].strip():
159 | start_index = 4 # Skip empty line after instructor
160 |
161 | i = start_index
162 | while i < len(lines):
163 | line = lines[i]
164 |
165 | # Check for lesson markers (e.g., "Lesson 0: Introduction")
166 | lesson_match = re.match(r'^Lesson\s+(\d+):\s*(.+)$', line.strip(), re.IGNORECASE)
167 |
168 | if lesson_match:
169 | # Process previous lesson if it exists
170 | if current_lesson is not None and lesson_content:
171 | lesson_text = '\n'.join(lesson_content).strip()
172 | if lesson_text:
173 | # Add lesson to course
174 | lesson = Lesson(
175 | lesson_number=current_lesson,
176 | title=lesson_title,
177 | lesson_link=lesson_link
178 | )
179 | course.lessons.append(lesson)
180 |
181 | # Create chunks for this lesson
182 | chunks = self.chunk_text(lesson_text)
183 | for idx, chunk in enumerate(chunks):
184 | # For the first chunk of each lesson, add lesson context
185 | if idx == 0:
186 | chunk_with_context = f"Lesson {current_lesson} content: {chunk}"
187 | else:
188 | chunk_with_context = chunk
189 |
190 | course_chunk = CourseChunk(
191 | content=chunk_with_context,
192 | course_title=course.title,
193 | lesson_number=current_lesson,
194 | chunk_index=chunk_counter
195 | )
196 | course_chunks.append(course_chunk)
197 | chunk_counter += 1
198 |
199 | # Start new lesson
200 | current_lesson = int(lesson_match.group(1))
201 | lesson_title = lesson_match.group(2).strip()
202 | lesson_link = None
203 |
204 | # Check if next line is a lesson link
205 | if i + 1 < len(lines):
206 | next_line = lines[i + 1].strip()
207 | link_match = re.match(r'^Lesson Link:\s*(.+)$', next_line, re.IGNORECASE)
208 | if link_match:
209 | lesson_link = link_match.group(1).strip()
210 | i += 1 # Skip the link line so it's not added to content
211 |
212 | lesson_content = []
213 | else:
214 | # Add line to current lesson content
215 | lesson_content.append(line)
216 |
217 | i += 1
218 |
219 | # Process the last lesson
220 | if current_lesson is not None and lesson_content:
221 | lesson_text = '\n'.join(lesson_content).strip()
222 | if lesson_text:
223 | lesson = Lesson(
224 | lesson_number=current_lesson,
225 | title=lesson_title,
226 | lesson_link=lesson_link
227 | )
228 | course.lessons.append(lesson)
229 |
230 | chunks = self.chunk_text(lesson_text)
231 | for idx, chunk in enumerate(chunks):
232 | # For any chunk of each lesson, add lesson context & course title
233 |
234 | chunk_with_context = f"Course {course_title} Lesson {current_lesson} content: {chunk}"
235 |
236 | course_chunk = CourseChunk(
237 | content=chunk_with_context,
238 | course_title=course.title,
239 | lesson_number=current_lesson,
240 | chunk_index=chunk_counter
241 | )
242 | course_chunks.append(course_chunk)
243 | chunk_counter += 1
244 |
245 | # If no lessons found, treat entire content as one document
246 | if not course_chunks and len(lines) > 2:
247 | remaining_content = '\n'.join(lines[start_index:]).strip()
248 | if remaining_content:
249 | chunks = self.chunk_text(remaining_content)
250 | for chunk in chunks:
251 | course_chunk = CourseChunk(
252 | content=chunk,
253 | course_title=course.title,
254 | chunk_index=chunk_counter
255 | )
256 | course_chunks.append(course_chunk)
257 | chunk_counter += 1
258 |
259 | return course, course_chunks
260 |
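A minimal usage sketch for this processor follows; the chunk sizes and the relative file path are illustrative assumptions, not values taken from the repo's configuration.

```python
# Hypothetical usage of DocumentProcessor; chunk_size, chunk_overlap and the
# path below are assumptions for illustration only.
from document_processor import DocumentProcessor

processor = DocumentProcessor(chunk_size=800, chunk_overlap=100)
course, chunks = processor.process_course_document("../docs/course3_script.txt")

print(course.title)                     # parsed from the "Course Title:" line
print(len(course.lessons), "lessons")   # one Lesson per "Lesson N:" marker
print(len(chunks), "chunks")            # sentence-based chunks with overlap
print(chunks[0].content[:120])          # first chunk carries lesson context
```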
--------------------------------------------------------------------------------
/backend/vector_store.py:
--------------------------------------------------------------------------------
1 | import chromadb
2 | from chromadb.config import Settings
3 | from typing import List, Dict, Any, Optional
4 | from dataclasses import dataclass
5 | from models import Course, CourseChunk
6 | from sentence_transformers import SentenceTransformer
7 |
8 | @dataclass
9 | class SearchResults:
10 | """Container for search results with metadata"""
11 | documents: List[str]
12 | metadata: List[Dict[str, Any]]
13 | distances: List[float]
14 | error: Optional[str] = None
15 |
16 | @classmethod
17 | def from_chroma(cls, chroma_results: Dict) -> 'SearchResults':
18 | """Create SearchResults from ChromaDB query results"""
19 | return cls(
20 | documents=chroma_results['documents'][0] if chroma_results['documents'] else [],
21 | metadata=chroma_results['metadatas'][0] if chroma_results['metadatas'] else [],
22 | distances=chroma_results['distances'][0] if chroma_results['distances'] else []
23 | )
24 |
25 | @classmethod
26 | def empty(cls, error_msg: str) -> 'SearchResults':
27 | """Create empty results with error message"""
28 | return cls(documents=[], metadata=[], distances=[], error=error_msg)
29 |
30 | def is_empty(self) -> bool:
31 | """Check if results are empty"""
32 | return len(self.documents) == 0
33 |
34 | class VectorStore:
35 | """Vector storage using ChromaDB for course content and metadata"""
36 |
37 | def __init__(self, chroma_path: str, embedding_model: str, max_results: int = 5):
38 | self.max_results = max_results
39 | # Initialize ChromaDB client
40 | self.client = chromadb.PersistentClient(
41 | path=chroma_path,
42 | settings=Settings(anonymized_telemetry=False)
43 | )
44 |
45 | # Set up sentence transformer embedding function
46 | self.embedding_function = chromadb.utils.embedding_functions.SentenceTransformerEmbeddingFunction(
47 | model_name=embedding_model
48 | )
49 |
50 | # Create collections for different types of data
51 | self.course_catalog = self._create_collection("course_catalog") # Course titles/instructors
52 | self.course_content = self._create_collection("course_content") # Actual course material
53 |
54 | def _create_collection(self, name: str):
55 | """Create or get a ChromaDB collection"""
56 | return self.client.get_or_create_collection(
57 | name=name,
58 | embedding_function=self.embedding_function
59 | )
60 |
61 | def search(self,
62 | query: str,
63 | course_name: Optional[str] = None,
64 | lesson_number: Optional[int] = None,
65 | limit: Optional[int] = None) -> SearchResults:
66 | """
67 | Main search interface that handles course resolution and content search.
68 |
69 | Args:
70 | query: What to search for in course content
71 | course_name: Optional course name/title to filter by
72 | lesson_number: Optional lesson number to filter by
73 | limit: Maximum results to return
74 |
75 | Returns:
76 | SearchResults object with documents and metadata
77 | """
78 | # Step 1: Resolve course name if provided
79 | course_title = None
80 | if course_name:
81 | course_title = self._resolve_course_name(course_name)
82 | if not course_title:
83 | return SearchResults.empty(f"No course found matching '{course_name}'")
84 |
85 | # Step 2: Build filter for content search
86 | filter_dict = self._build_filter(course_title, lesson_number)
87 |
88 | # Step 3: Search course content
89 | # Use provided limit or fall back to configured max_results
90 | search_limit = limit if limit is not None else self.max_results
91 |
92 | try:
93 | results = self.course_content.query(
94 | query_texts=[query],
95 | n_results=search_limit,
96 | where=filter_dict
97 | )
98 | return SearchResults.from_chroma(results)
99 | except Exception as e:
100 | return SearchResults.empty(f"Search error: {str(e)}")
101 |
102 | def _resolve_course_name(self, course_name: str) -> Optional[str]:
103 | """Use vector search to find best matching course by name"""
104 | try:
105 | results = self.course_catalog.query(
106 | query_texts=[course_name],
107 | n_results=1
108 | )
109 |
110 | if results['documents'][0] and results['metadatas'][0]:
111 | # Return the title (which is now the ID)
112 | return results['metadatas'][0][0]['title']
113 | except Exception as e:
114 | print(f"Error resolving course name: {e}")
115 |
116 | return None
117 |
118 | def _build_filter(self, course_title: Optional[str], lesson_number: Optional[int]) -> Optional[Dict]:
119 | """Build ChromaDB filter from search parameters"""
120 | if not course_title and lesson_number is None:
121 | return None
122 |
123 | # Handle different filter combinations
124 | if course_title and lesson_number is not None:
125 | return {"$and": [
126 | {"course_title": course_title},
127 | {"lesson_number": lesson_number}
128 | ]}
129 |
130 | if course_title:
131 | return {"course_title": course_title}
132 |
133 | return {"lesson_number": lesson_number}
134 |
135 | def add_course_metadata(self, course: Course):
136 | """Add course information to the catalog for semantic search"""
137 | import json
138 |
139 | course_text = course.title
140 |
141 | # Build lessons metadata and serialize as JSON string
142 | lessons_metadata = []
143 | for lesson in course.lessons:
144 | lessons_metadata.append({
145 | "lesson_number": lesson.lesson_number,
146 | "lesson_title": lesson.title,
147 | "lesson_link": lesson.lesson_link
148 | })
149 |
150 | self.course_catalog.add(
151 | documents=[course_text],
152 | metadatas=[{
153 | "title": course.title,
154 | "instructor": course.instructor,
155 | "course_link": course.course_link,
156 | "lessons_json": json.dumps(lessons_metadata), # Serialize as JSON string
157 | "lesson_count": len(course.lessons)
158 | }],
159 | ids=[course.title]
160 | )
161 |
162 | def add_course_content(self, chunks: List[CourseChunk]):
163 | """Add course content chunks to the vector store"""
164 | if not chunks:
165 | return
166 |
167 | documents = [chunk.content for chunk in chunks]
168 | metadatas = [{
169 | "course_title": chunk.course_title,
170 | "lesson_number": chunk.lesson_number,
171 | "chunk_index": chunk.chunk_index
172 | } for chunk in chunks]
173 | # Use title with chunk index for unique IDs
174 | ids = [f"{chunk.course_title.replace(' ', '_')}_{chunk.chunk_index}" for chunk in chunks]
175 |
176 | self.course_content.add(
177 | documents=documents,
178 | metadatas=metadatas,
179 | ids=ids
180 | )
181 |
182 | def clear_all_data(self):
183 | """Clear all data from both collections"""
184 | try:
185 | self.client.delete_collection("course_catalog")
186 | self.client.delete_collection("course_content")
187 | # Recreate collections
188 | self.course_catalog = self._create_collection("course_catalog")
189 | self.course_content = self._create_collection("course_content")
190 | except Exception as e:
191 | print(f"Error clearing data: {e}")
192 |
193 | def get_existing_course_titles(self) -> List[str]:
194 | """Get all existing course titles from the vector store"""
195 | try:
196 | # Get all documents from the catalog
197 | results = self.course_catalog.get()
198 | if results and 'ids' in results:
199 | return results['ids']
200 | return []
201 | except Exception as e:
202 | print(f"Error getting existing course titles: {e}")
203 | return []
204 |
205 | def get_course_count(self) -> int:
206 | """Get the total number of courses in the vector store"""
207 | try:
208 | results = self.course_catalog.get()
209 | if results and 'ids' in results:
210 | return len(results['ids'])
211 | return 0
212 | except Exception as e:
213 | print(f"Error getting course count: {e}")
214 | return 0
215 |
216 | def get_all_courses_metadata(self) -> List[Dict[str, Any]]:
217 | """Get metadata for all courses in the vector store"""
218 | import json
219 | try:
220 | results = self.course_catalog.get()
221 | if results and 'metadatas' in results:
222 | # Parse lessons JSON for each course
223 | parsed_metadata = []
224 | for metadata in results['metadatas']:
225 | course_meta = metadata.copy()
226 | if 'lessons_json' in course_meta:
227 | course_meta['lessons'] = json.loads(course_meta['lessons_json'])
228 | del course_meta['lessons_json'] # Remove the JSON string version
229 | parsed_metadata.append(course_meta)
230 | return parsed_metadata
231 | return []
232 | except Exception as e:
233 | print(f"Error getting courses metadata: {e}")
234 | return []
235 |
236 | def get_course_link(self, course_title: str) -> Optional[str]:
237 | """Get course link for a given course title"""
238 | try:
239 | # Get course by ID (title is the ID)
240 | results = self.course_catalog.get(ids=[course_title])
241 | if results and 'metadatas' in results and results['metadatas']:
242 | metadata = results['metadatas'][0]
243 | return metadata.get('course_link')
244 | return None
245 | except Exception as e:
246 | print(f"Error getting course link: {e}")
247 | return None
248 |
249 | def get_lesson_link(self, course_title: str, lesson_number: int) -> Optional[str]:
250 | """Get lesson link for a given course title and lesson number"""
251 | import json
252 | try:
253 | # Get course by ID (title is the ID)
254 | results = self.course_catalog.get(ids=[course_title])
255 | if results and 'metadatas' in results and results['metadatas']:
256 | metadata = results['metadatas'][0]
257 | lessons_json = metadata.get('lessons_json')
258 | if lessons_json:
259 | lessons = json.loads(lessons_json)
260 | # Find the lesson with matching number
261 | for lesson in lessons:
262 | if lesson.get('lesson_number') == lesson_number:
263 | return lesson.get('lesson_link')
264 | return None
265 | except Exception as e:
266 | print(f"Error getting lesson link: {e}")
267 |
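A minimal end-to-end sketch of how this store is typically fed and queried; the Chroma path, embedding model name, and query below are assumptions rather than the project's actual configuration.

```python
# Hypothetical wiring of DocumentProcessor + VectorStore; paths, model name and
# the query are assumptions for illustration only.
from document_processor import DocumentProcessor
from vector_store import VectorStore

store = VectorStore(chroma_path="./chroma_db",
                    embedding_model="all-MiniLM-L6-v2",
                    max_results=5)

processor = DocumentProcessor(chunk_size=800, chunk_overlap=100)
course, chunks = processor.process_course_document("../docs/course3_script.txt")

store.add_course_metadata(course)   # goes into the course_catalog collection
store.add_course_content(chunks)    # goes into the course_content collection

# Fuzzy course-name resolution and lesson filtering happen inside search()
results = store.search("What is a cross encoder?",
                       course_name="Advanced Retrieval",
                       lesson_number=0)
if results.error:
    print(results.error)
else:
    for doc, meta in zip(results.documents, results.metadata):
        print(meta["lesson_number"], doc[:80])
```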
--------------------------------------------------------------------------------
/frontend/style.css:
--------------------------------------------------------------------------------
1 | /* Modern CSS Reset */
2 | *, *::before, *::after {
3 | box-sizing: border-box;
4 | margin: 0;
5 | padding: 0;
6 | }
7 |
8 | /* CSS Variables */
9 | :root {
10 | --primary-color: #2563eb;
11 | --primary-hover: #1d4ed8;
12 | --background: #0f172a;
13 | --surface: #1e293b;
14 | --surface-hover: #334155;
15 | --text-primary: #f1f5f9;
16 | --text-secondary: #94a3b8;
17 | --border-color: #334155;
18 | --user-message: #2563eb;
19 | --assistant-message: #374151;
20 | --shadow: 0 4px 6px -1px rgba(0, 0, 0, 0.3);
21 | --radius: 12px;
22 | --focus-ring: rgba(37, 99, 235, 0.2);
23 | --welcome-bg: #1e3a5f;
24 | --welcome-border: #2563eb;
25 | }
26 |
27 | /* Base Styles */
28 | body {
29 | font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif;
30 | background-color: var(--background);
31 | color: var(--text-primary);
32 | line-height: 1.6;
33 | height: 100vh;
34 | overflow: hidden;
35 | margin: 0;
36 | padding: 0;
37 | }
38 |
39 | /* Container - Full Screen */
40 | .container {
41 | height: 100vh;
42 | width: 100vw;
43 | display: flex;
44 | flex-direction: column;
45 | margin: 0;
46 | padding: 0;
47 | }
48 |
49 | /* Header - Hidden */
50 | header {
51 | display: none;
52 | }
53 |
54 | header h1 {
55 | font-size: 1.75rem;
56 | font-weight: 700;
57 | background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
58 | -webkit-background-clip: text;
59 | -webkit-text-fill-color: transparent;
60 | background-clip: text;
61 | margin: 0;
62 | }
63 |
64 | .subtitle {
65 | font-size: 0.95rem;
66 | color: var(--text-secondary);
67 | margin-top: 0.5rem;
68 | }
69 |
70 | /* Main Content Area with Sidebar */
71 | .main-content {
72 | flex: 1;
73 | display: flex;
74 | overflow: hidden;
75 | background: var(--background);
76 | }
77 |
78 | /* Left Sidebar */
79 | .sidebar {
80 | width: 320px;
81 | background: var(--surface);
82 | border-right: 1px solid var(--border-color);
83 | padding: 1.5rem;
84 | overflow-y: auto;
85 | flex-shrink: 0;
86 | }
87 |
88 | /* Custom Scrollbar for Sidebar */
89 | .sidebar::-webkit-scrollbar {
90 | width: 8px;
91 | }
92 |
93 | .sidebar::-webkit-scrollbar-track {
94 | background: var(--surface);
95 | }
96 |
97 | .sidebar::-webkit-scrollbar-thumb {
98 | background: var(--border-color);
99 | border-radius: 4px;
100 | }
101 |
102 | .sidebar::-webkit-scrollbar-thumb:hover {
103 | background: var(--text-secondary);
104 | }
105 |
106 | .sidebar-section {
107 | margin-bottom: 1.5rem;
108 | }
109 |
110 | .sidebar-section:last-child {
111 | margin-bottom: 0;
112 | }
113 |
114 | /* Main Chat Area */
115 | .chat-main {
116 | flex: 1;
117 | display: flex;
118 | justify-content: center;
119 | overflow: hidden;
120 | padding: 0;
121 | background: var(--background);
122 | }
123 |
124 | /* Chat Container - Centered with Max Width */
125 | .chat-container {
126 | flex: 1;
127 | display: flex;
128 | flex-direction: column;
129 | background: var(--background);
130 | overflow: hidden;
131 | width: 100%;
132 | max-width: 800px;
133 | margin: 0;
134 | }
135 |
136 | /* Chat Messages */
137 | .chat-messages {
138 | flex: 1;
139 | overflow-y: auto;
140 | padding: 2rem;
141 | display: flex;
142 | flex-direction: column;
143 | gap: 1rem;
144 | background: var(--background);
145 | }
146 |
147 | /* Custom Scrollbar */
148 | .chat-messages::-webkit-scrollbar {
149 | width: 8px;
150 | }
151 |
152 | .chat-messages::-webkit-scrollbar-track {
153 | background: var(--surface);
154 | }
155 |
156 | .chat-messages::-webkit-scrollbar-thumb {
157 | background: var(--border-color);
158 | border-radius: 4px;
159 | }
160 |
161 | .chat-messages::-webkit-scrollbar-thumb:hover {
162 | background: var(--text-secondary);
163 | }
164 |
165 | /* Message Styles */
166 | .message {
167 | max-width: 85%;
168 | animation: fadeIn 0.3s ease-out;
169 | }
170 |
171 | @keyframes fadeIn {
172 | from {
173 | opacity: 0;
174 | transform: translateY(10px);
175 | }
176 | to {
177 | opacity: 1;
178 | transform: translateY(0);
179 | }
180 | }
181 |
182 | .message.user {
183 | align-self: flex-end;
184 | }
185 |
186 | .message.assistant {
187 | align-self: flex-start;
188 | }
189 |
190 | .message-content {
191 | padding: 0.75rem 1.25rem;
192 | border-radius: 18px;
193 | word-wrap: break-word;
194 | line-height: 1.5;
195 | }
196 |
197 | .message.user .message-content {
198 | background: var(--user-message);
199 | color: white;
200 | border-bottom-right-radius: 4px;
201 | }
202 |
203 | .message.assistant .message-content {
204 | background: var(--surface);
205 | color: var(--text-primary);
206 | border-bottom-left-radius: 4px;
207 | }
208 |
209 | /* Message metadata */
210 | .message-meta {
211 | font-size: 0.75rem;
212 | color: var(--text-secondary);
213 | margin-top: 0.25rem;
214 | padding: 0 0.5rem;
215 | }
216 |
217 | .message.user .message-meta {
218 | text-align: right;
219 | }
220 |
221 | /* Collapsible Sources */
222 | .sources-collapsible {
223 | margin-top: 0.5rem;
224 | font-size: 0.75rem;
225 | color: var(--text-secondary);
226 | }
227 |
228 | .sources-collapsible summary {
229 | cursor: pointer;
230 | padding: 0.25rem 0.5rem;
231 | user-select: none;
232 | font-weight: 500;
233 | }
234 |
235 | .sources-collapsible summary:hover {
236 | color: var(--text-primary);
237 | }
238 |
239 | .sources-collapsible[open] summary {
240 | margin-bottom: 0.25rem;
241 | }
242 |
243 | .sources-content {
244 | padding: 0 0.5rem 0.25rem 1.5rem;
245 | color: var(--text-secondary);
246 | }
247 |
248 | /* Markdown formatting styles */
249 | .message-content h1,
250 | .message-content h2,
251 | .message-content h3,
252 | .message-content h4,
253 | .message-content h5,
254 | .message-content h6 {
255 | margin: 0.5rem 0;
256 | font-weight: 600;
257 | }
258 |
259 | .message-content h1 { font-size: 1.5rem; }
260 | .message-content h2 { font-size: 1.3rem; }
261 | .message-content h3 { font-size: 1.1rem; }
262 |
263 | .message-content p {
264 | margin: 0.5rem 0;
265 | line-height: 1.6;
266 | }
267 |
268 | .message-content ul,
269 | .message-content ol {
270 | margin: 0.5rem 0;
271 | padding-left: 1.5rem;
272 | }
273 |
274 | .message-content li {
275 | margin: 0.25rem 0;
276 | line-height: 1.6;
277 | }
278 |
279 | .message-content code {
280 | background-color: rgba(0, 0, 0, 0.2);
281 | padding: 0.125rem 0.25rem;
282 | border-radius: 3px;
283 | font-family: 'Fira Code', 'Consolas', monospace;
284 | font-size: 0.875em;
285 | }
286 |
287 | .message-content pre {
288 | background-color: rgba(0, 0, 0, 0.2);
289 | padding: 0.75rem;
290 | border-radius: 4px;
291 | overflow-x: auto;
292 | margin: 0.5rem 0;
293 | }
294 |
295 | .message-content pre code {
296 | background-color: transparent;
297 | padding: 0;
298 | }
299 |
300 | .message-content blockquote {
301 | border-left: 3px solid var(--primary-color);
302 | padding-left: 1rem;
303 | margin: 0.5rem 0;
304 | color: var(--text-secondary);
305 | }
306 |
307 | /* Welcome message special styling */
308 | .message.welcome-message .message-content {
309 | background: var(--surface);
310 | border: 2px solid var(--border-color);
311 | box-shadow: 0 4px 16px rgba(0, 0, 0, 0.2);
312 | position: relative;
313 | }
314 |
315 | .message-content strong {
316 | font-weight: 600;
317 | }
318 |
319 | .message-content em {
320 | font-style: italic;
321 | }
322 |
323 | .message-content hr {
324 | border: none;
325 | border-top: 1px solid var(--border-color);
326 | margin: 1rem 0;
327 | }
328 |
329 | /* Chat Input Container */
330 | .chat-input-container {
331 | display: flex;
332 | gap: 0.75rem;
333 | padding: 1.5rem 2rem;
334 | background: var(--background);
335 | border-top: 1px solid var(--border-color);
336 | flex-shrink: 0;
337 | }
338 |
339 | /* Chat Input */
340 | #chatInput {
341 | flex: 1;
342 | padding: 0.875rem 1.25rem;
343 | background: var(--surface);
344 | border: 1px solid var(--border-color);
345 | border-radius: 24px;
346 | color: var(--text-primary);
347 | font-size: 0.95rem;
348 | transition: all 0.2s ease;
349 | }
350 |
351 | #chatInput:focus {
352 | outline: none;
353 | border-color: var(--primary-color);
354 | box-shadow: 0 0 0 3px var(--focus-ring);
355 | }
356 |
357 | #chatInput::placeholder {
358 | color: var(--text-secondary);
359 | }
360 |
361 | /* Send Button */
362 | #sendButton {
363 | padding: 0.75rem 1.25rem;
364 | background: var(--primary-color);
365 | color: white;
366 | border: none;
367 | border-radius: 24px;
368 | cursor: pointer;
369 | display: flex;
370 | align-items: center;
371 | justify-content: center;
372 | transition: all 0.2s ease;
373 | min-width: 52px;
374 | }
375 |
376 | #sendButton:focus {
377 | outline: none;
378 | box-shadow: 0 0 0 3px var(--focus-ring);
379 | }
380 |
381 | #sendButton:hover:not(:disabled) {
382 | background: var(--primary-hover);
383 | transform: translateY(-1px);
384 | box-shadow: 0 4px 12px rgba(37, 99, 235, 0.3);
385 | }
386 |
387 | #sendButton:active:not(:disabled) {
388 | transform: translateY(0);
389 | }
390 |
391 | #sendButton:disabled {
392 | opacity: 0.5;
393 | cursor: not-allowed;
394 | }
395 |
396 | /* Loading Animation */
397 | .loading {
398 | display: inline-flex;
399 | gap: 4px;
400 | padding: 0.75rem 1.25rem;
401 | }
402 |
403 | .loading span {
404 | width: 8px;
405 | height: 8px;
406 | background: var(--text-secondary);
407 | border-radius: 50%;
408 | animation: bounce 1.4s infinite ease-in-out both;
409 | }
410 |
411 | .loading span:nth-child(1) {
412 | animation-delay: -0.32s;
413 | }
414 |
415 | .loading span:nth-child(2) {
416 | animation-delay: -0.16s;
417 | }
418 |
419 | @keyframes bounce {
420 | 0%, 80%, 100% {
421 | transform: scale(0);
422 | }
423 | 40% {
424 | transform: scale(1);
425 | }
426 | }
427 |
428 | /* Error Message */
429 | .error-message {
430 | background: rgba(239, 68, 68, 0.1);
431 | color: #f87171;
432 | padding: 0.75rem 1.25rem;
433 | border-radius: 8px;
434 | border: 1px solid rgba(239, 68, 68, 0.2);
435 | margin: 0.5rem 0;
436 | }
437 |
438 | /* Success Message */
439 | .success-message {
440 | background: rgba(34, 197, 94, 0.1);
441 | color: #4ade80;
442 | padding: 0.75rem 1.25rem;
443 | border-radius: 8px;
444 | border: 1px solid rgba(34, 197, 94, 0.2);
445 | margin: 0.5rem 0;
446 | }
447 |
448 | /* Sidebar Headers */
449 | .stats-header,
450 | .suggested-header {
451 | font-size: 0.875rem;
452 | font-weight: 600;
453 | color: var(--text-secondary);
454 | cursor: pointer;
455 | padding: 0.5rem 0;
456 | border: none;
457 | background: none;
458 | list-style: none;
459 | outline: none;
460 | transition: color 0.2s ease;
461 | text-transform: uppercase;
462 | letter-spacing: 0.5px;
463 | }
464 |
465 | .stats-header:focus,
466 | .suggested-header:focus {
467 | color: var(--primary-color);
468 | }
469 |
470 | .stats-header:hover,
471 | .suggested-header:hover {
472 | color: var(--primary-color);
473 | }
474 |
475 | .stats-header::-webkit-details-marker,
476 | .suggested-header::-webkit-details-marker {
477 | display: none;
478 | }
479 |
480 | .stats-header::before,
481 | .suggested-header::before {
482 | content: '▶';
483 | display: inline-block;
484 | margin-right: 0.5rem;
485 | transition: transform 0.2s ease;
486 | font-size: 0.75rem;
487 | }
488 |
489 | details[open] .stats-header::before,
490 | details[open] .suggested-header::before {
491 | transform: rotate(90deg);
492 | }
493 |
494 | /* Course Stats in Sidebar */
495 | .course-stats {
496 | display: flex;
497 | flex-direction: column;
498 | gap: 1rem;
499 | padding: 0.75rem 0;
500 | background: transparent;
501 | border: none;
502 | }
503 |
504 | .stat-item {
505 | text-align: left;
506 | padding: 0.75rem;
507 | background: var(--background);
508 | border-radius: 8px;
509 | border: 1px solid var(--border-color);
510 | margin-bottom: 0.75rem;
511 | }
512 |
513 | .stat-item:last-child {
514 | margin-bottom: 0;
515 | }
516 |
517 | .stat-value {
518 | display: inline-block;
519 | font-size: 0.875rem;
520 | font-weight: 600;
521 | color: var(--primary-color);
522 | margin-left: 0.5rem;
523 | }
524 |
525 | .stat-label {
526 | display: inline-block;
527 | font-size: 0.875rem;
528 | color: var(--text-secondary);
529 | font-weight: 600;
530 | }
531 |
532 | .stat-item:last-child .stat-label {
533 | display: block;
534 | margin-bottom: 0.5rem;
535 | }
536 |
537 | /* Course titles collapsible */
538 | .course-titles-collapsible {
539 | width: 100%;
540 | }
541 |
542 | .course-titles-header {
543 | cursor: pointer;
544 | font-size: 0.875rem;
545 | color: var(--text-secondary);
546 | font-weight: 600;
547 | padding: 0.5rem 0;
548 | list-style: none;
549 | display: block;
550 | user-select: none;
551 | }
552 |
553 | .course-titles-header:focus {
554 | outline: none;
555 | color: var(--primary-color);
556 | }
557 |
558 | .course-titles-header::-webkit-details-marker {
559 | display: none;
560 | }
561 |
562 | .course-titles-header::before {
563 | content: '▶';
564 | display: inline-block;
565 | margin-right: 0.5rem;
566 | transition: transform 0.2s ease;
567 | font-size: 0.75rem;
568 | }
569 |
570 | .course-titles-collapsible[open] .course-titles-header::before {
571 | transform: rotate(90deg);
572 | }
573 |
574 | /* Course titles display */
575 | .course-titles {
576 | margin-top: 0.5rem;
577 | /* Remove max-height to show all titles without scrolling */
578 | }
579 |
580 | .course-title-item {
581 | font-size: 0.85rem;
582 | color: var(--text-primary);
583 | padding: 0.5rem 0.25rem;
584 | border-bottom: 1px solid var(--border-color);
585 | text-transform: none;
586 | line-height: 1.4;
587 | }
588 |
589 | .course-title-item:last-child {
590 | border-bottom: none;
591 | }
592 |
593 | .course-title-item:first-child {
594 | padding-top: 0.25rem;
595 | }
596 |
597 | .no-courses, .loading, .error {
598 | font-size: 0.85rem;
599 | color: var(--text-secondary);
600 | font-style: italic;
601 | text-transform: none;
602 | }
603 |
604 | /* Suggested Questions in Sidebar */
605 | .suggested-items {
606 | display: flex;
607 | flex-direction: column;
608 | gap: 0.5rem;
609 | padding: 0.75rem 0;
610 | }
611 |
612 | .suggested-item {
613 | padding: 0.75rem 1rem;
614 | background: var(--background);
615 | border: 1px solid var(--border-color);
616 | border-radius: 8px;
617 | color: var(--text-primary);
618 | font-size: 0.875rem;
619 | cursor: pointer;
620 | transition: all 0.2s ease;
621 | text-align: left;
622 | width: 100%;
623 | }
624 |
625 | .suggested-item:focus {
626 | outline: none;
627 | box-shadow: 0 0 0 3px var(--focus-ring);
628 | }
629 |
630 | .suggested-item:hover {
631 | background: var(--surface-hover);
632 | border-color: var(--primary-color);
633 | color: var(--primary-color);
634 | transform: translateX(2px);
635 | }
636 |
637 | /* Responsive Design */
638 | @media (max-width: 768px) {
639 | .main-content {
640 | flex-direction: column;
641 | }
642 |
643 | .sidebar {
644 | width: 100%;
645 | border-right: none;
646 | border-bottom: 1px solid var(--border-color);
647 | padding: 1rem;
648 | order: 2;
649 | max-height: 40vh;
650 | }
651 |
652 | .sidebar::-webkit-scrollbar {
653 | width: 8px;
654 | }
655 |
656 | .sidebar::-webkit-scrollbar-track {
657 | background: var(--surface);
658 | }
659 |
660 | .sidebar::-webkit-scrollbar-thumb {
661 | background: var(--border-color);
662 | border-radius: 4px;
663 | }
664 |
665 | .sidebar::-webkit-scrollbar-thumb:hover {
666 | background: var(--text-secondary);
667 | }
668 |
669 | .chat-main {
670 | order: 1;
671 | }
672 |
673 | header {
674 | padding: 1rem;
675 | }
676 |
677 | header h1 {
678 | font-size: 1.5rem;
679 | }
680 |
681 | .chat-messages {
682 | padding: 1rem;
683 | }
684 |
685 | .message {
686 | max-width: 90%;
687 | }
688 |
689 | .chat-input-container {
690 | padding: 1rem;
691 | gap: 0.5rem;
692 | }
693 |
694 | #chatInput {
695 | padding: 0.75rem 1rem;
696 | font-size: 0.9rem;
697 | }
698 |
699 | #sendButton {
700 | padding: 0.75rem 1rem;
701 | min-width: 48px;
702 | }
703 |
704 | .stat-value {
705 | font-size: 1.25rem;
706 | }
707 |
708 | .suggested-item {
709 | padding: 0.5rem 0.75rem;
710 | font-size: 0.8rem;
711 | }
712 | }
713 |
714 | @media (max-width: 1024px) {
715 | .sidebar {
716 | width: 280px;
717 | }
718 | }
719 |
--------------------------------------------------------------------------------
/docs/course3_script.txt:
--------------------------------------------------------------------------------
1 | Course Title: Advanced Retrieval for AI with Chroma
2 | Course Link: https://www.deeplearning.ai/short-courses/advanced-retrieval-for-ai/
3 | Course Instructor: Anton Troynikov
4 |
5 | Lesson 0: Introduction
6 | Lesson Link: https://learn.deeplearning.ai/courses/advanced-retrieval-for-ai/lesson/kb5oj/introduction
7 | RAG, or Retrieval Augmented Generation, retrieves relevant documents to give context to an LLM, and this makes it much better at answering queries and performing tasks. Many teams are using simple retrieval techniques based on semantic similarity or embeddings, but you'll learn more sophisticated techniques in this course, which let you do much better than that. A common workflow in RAG is to take your query and embed that, then find the most similar documents, meaning ones with similar embeddings, and that's the context. But the problem with that is that it can tend to find documents that talk about similar topics as the query, but don't actually contain the answer. But you can take the initial user query and rewrite it. This is called query expansion. Rewriting it helps pull in more directly related documents. There are two key related techniques: one, to expand the original query into multiple queries by rewording or rewriting it in different ways; and second, to even guess or hypothesize what the answer might look like, to see if we can find anything in the document collection that looks more like an answer, rather than only generally talking about the topics of the query. I'm delighted that the instructor for this course is Anton Troynikov. Anton has been one of the innovators driving forward search and retrieval for AI applications. He is co-founder of Chroma, which provides one of the most popular open source vector databases. If you've taken one of our LangChain short courses taught by Harrison Chase, you have very likely used Chroma. Thank you, Andrew. I'm really excited to be working with you on this course and to share what I'm seeing out in the field in terms of what does and doesn't work in RAG deployments. We'll start off the course by doing a quick review of RAG applications. You will then learn about some of the pitfalls of retrieval, where simple vector search doesn't do well. Then you'll learn several methods to improve the results. As Andrew mentioned, the first methods use an LLM to improve the query itself. Another method reranks query results with help from something called a cross-encoder, which takes in a pair of sentences and produces a relevancy score. You'll also learn how to adapt the query embeddings based on user feedback to produce more relevant results. There's a lot of innovation going on in RAG right now, so in the final lesson we'll also go over some of the cutting-edge techniques that aren't mainstream yet and are only just now appearing in research, and I think they will become much more mainstream soon. We'd like to acknowledge some of the folks who have worked on this course. From the Chroma team, we'd like to thank Jeff Huber, Hammad Bashir and Ben Eggers, as well as Chroma's open source developer community. From the DeepLearning.AI team, we have Jeff Lodwick and Mark Gregory. The first lesson starts with an overview of RAG. I hope you go on to watch that right after this. With these techniques, it turns out, it is possible for smaller teams than ever to build effective systems. So after this course, you might be able to build something really cool with an approach that previously would have been considered rag-tag.
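The two query-expansion ideas previewed here can be sketched roughly as follows; the model name and prompt wording are assumptions for illustration, not the course's actual notebook code.

```python
# Rough sketch of the two expansion techniques described above.
# The model name and prompts are assumptions, not taken from the course.
from openai import OpenAI

client = OpenAI()

def expand_into_multiple_queries(query: str, n: int = 3) -> list[str]:
    """Reword the query several ways to pull in more directly related documents."""
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system",
             "content": f"Propose {n} alternative phrasings of the user's question, one per line."},
            {"role": "user", "content": query},
        ],
    )
    return [line.strip() for line in response.choices[0].message.content.splitlines() if line.strip()]

def hypothetical_answer(query: str) -> str:
    """Guess what an answer might look like, then use that guess as the search probe."""
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system",
             "content": "Write a short, plausible answer to the question. It will only be used to search a document collection."},
            {"role": "user", "content": query},
        ],
    )
    return response.choices[0].message.content
```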
8 | Lesson 1: Overview Of Embeddings Based Retrieval
9 | Lesson Link: https://learn.deeplearning.ai/courses/advanced-retrieval-for-ai/lesson/ukzj4/overview-of-embeddings-based-retrieval
10 | In this first lesson, we're going to review some of the elements in an embeddings based retrieval system and how that fits together in a retrieval augmented generation loop, together with an LLM. So let's go. So the first thing I'd like to show you is the overall system diagram of how this works in practice. The way retrieval augmented generation works is you have some user query that comes in, and you have a set of documents that you've previously embedded and stored in your retrieval system. In this case, you take your query, you run your query through the same embedding model as you used to embed your documents, which generates an embedding. You embed the query, and then the retrieval system finds the most relevant documents according to the embedding of that query, by finding the nearest neighbor embeddings of those documents. We then return both the query and the relevant documents to the LLM, and the LLM synthesizes information from the retrieved documents to generate an answer. Let's show how this works in practice. To start with, we're going to pull in some helper functions from our utilities. This function just basically is a basic word wrap function, which allows us to look at the documents in a nicely pretty printed way. And the example that we're going to use, we're going to read from a PDF. So we're going to pull in PDF reader. This is a really simple Python package that you can easily import. It's open source. And we're going to read from Microsoft's 2022 annual report. And so to do that we're going to extract the text from the report using this PDF reader application. And all we're doing here is for every page that the reader has, we're extracting the text from that page, and we're also stripping the whitespace characters from those pages. Now, the other important thing that we really need to do is make sure that we're not sending any empty strings, that there aren't any empty pages that we send to our retrieval system. So we're going to filter out those as well. And this little loop just basically checks to see if there's an empty string, and if there is, we don't add it to the final list of PDF texts. And so just to show the sort of output that we get here, we'll print an example. And what we'll do is print the output of the first page of extracted text from this PDF. Here we are. And this is what the PDF reader has extracted as text from the first page of the document. So in our next step we need to chunk these pages, first by character and then by token. To do that we're going to grab some useful utilities from LangChain. We're going to use some LangChain text splitters. We're going to use the recursive character text splitter and the sentence transformers token text splitter. It's important that we use the sentence transformers token text splitter, and I'll explain why in just a moment. But first, let's start with the character splitter. The character splitter allows us to divide text recursively according to certain divider characters. And what that practically means is first, in each presented piece of text, the recursive character text splitter will find the double newlines and split on double newlines, and then if the chunks that got split are still larger than our target chunk size, in this case 1000 characters, it will use the next character to split them, then the next character, then just a space, and finally it will split just on character boundaries itself. Then we've also selected a chunk overlap of zero.
This is a hyperparameter that you can play with to decide what optimal chunking looks like for you. So let's go ahead and run this. And we're going to output the output of the character text splitter. We're going to look at the 10th text split chunk that we got. And we're also going to output the total number of chunks that the character splitter gives us. So let's run this cell and take a look at the output. So we see the 10th chunk is all of this text according to the recursive character text splitter. And there are 347 chunks in total from this annual report PDF. So now we've split by character. The character text splitting isn't quite enough, and the reason for that is because the embedding model, which we use called sentence transformers, has a limited context window width. In fact, it uses 256 characters. That's the maximum context window length of our embedding model. This is a minor pitfall if you're not used to working with embeddings, you may not consider the embedding model context window itself. But it's very, very important because typically an embedding model will simply truncate any characters or tokens that are beyond its context window. So to make sure that we're actually capturing all the meaning in each chunk when we go to embed it, it's very important that we also chunk according to the token count. And what we're doing here is we're using the sentence transformers token text splitter, again with a chunk overlap of zero. And we're using 256 tokens per chunk, which is the context window length of the sentence transformer embedding model. And I'll go into more detail about that embedding model in a little bit. And we are essentially taking all of the chunks that were generated by the character text splitter, and we are splitting them using the token text splitter. Let's put out similar output to what we had in the last cell and see what we observe here. So we see a similar chunk. It's a little bit different to what we got before. Obviously it's fewer characters because we have only 256 tokens. This is again the 10th chunk. And we notice that we have a couple more chunks than we had before. In the previous output we had 347 chunks. In this output we have 349. So it's divided a couple of the existing chunks into more pieces. So we have our text chunks. That's the first step in any retrieval augmented generation system. The next step is to load the chunks that we have into our retriever, and in this case we'll be using Chroma. So to use Chroma we need to import Chroma itself. And we're going to use the sentence transformer embedding model as promised. Now let's talk a little bit about the sentence transformer embedding model and what this actually means and what an embedding model really actually even is. So the sentence transformer embedding model is essentially an extension of the BERT transformer architecture. The BERT architecture embeds each token individually. So here we have the classification token, and then "I like dogs". Each token receives its own dense vector embedding. What a sentence transformer does is allow you to embed entire sentences or even small documents like we have here, by pooling the output of all the token embeddings to produce a single dense vector per document, or in our case, per chunk. Sentence transformers are great as an embedding model. They're open source, all the weights are available online, and they're really easy to run locally.
They come built into Chroma, and you can learn more about them by looking up the Sentence Transformers website or taking a look at the linked paper. So that's why we're using sentence transformers. And now hopefully it makes sense why we use the sentence transformers token text splitter. So what we're going to do is we're going to create a sentence transformer embedding function. This is for use with Chroma. And we're going to demonstrate basically what happens when this embedding function gets called. So that's the output of this. So let's take a look. Now you may get this warning about Hugging Face tokenizers. This is a minor bug in Hugging Face. This is nothing to worry about. Perfectly normal. And here's the output that we get. And you can see this is one very, very long vector. It's a dense vector. Every entry in the vector has a number associated with it. And this is the representation of the 10th text chunk that we showed you before as a dense vector. This vector has 384 dimensions, which sounds like a lot unless you consider the full dimensionality of all English text, which is much, much higher. So the next step is to set up Chroma. We're going to use the default Chroma client, which is great if you're just experimenting in a notebook. And we're going to make a new Chroma collection. And the collection is going to be called Microsoft Annual Report 2022. And we're also going to pass in our embedding function, which we defined before, which as you remember is a sentence transformer embedding function. We are going to create IDs for each of the text chunks that we've created. And they're just going to be the string of the number of their position in the total token split texts. And then what we're going to do is we're going to add those documents to our Chroma collection, and to make sure that everything is the way we expect, let's just output the count after everything has been added. And let's run this cell. So now that we have everything loaded into Chroma, let's connect an LLM and build a full-fledged RAG system, and we're going to demonstrate how querying and retrieval and LLMs all work together. So let's start with a pretty simple query. I think if you're reading an annual financial report, one of the top questions you have in mind is: what was the total revenue for this year? And what we're going to do is we're going to get some results from Chroma by querying it. And we see here that we call query on our collection. We pass our query texts and we're asking for five results. Chroma, under the hood, will use the embedding function that you've defined on your collection to automatically embed the query for you. So you don't really have to do anything else to call that embedding function again. And we're going to pull the retrieved documents out of the results. This zero on the end is basically saying give me the results for the zeroth query. We only have the one query. And what we're going to output now is basically the retrieved documents themselves. And take a look. So let's run the cell. And we see that the documents that we get here are fairly relevant to our query of what was the total revenue. We have revenue classified by different product and service offerings. We're talking about unearned revenue. And there's more information in a similar vein. So the next step is to use these results together with an LLM to answer our query. We're going to use GPT for this. And we need to just do a little bit of setup so that we can have an OpenAI client.
We're going to load our OpenAI API key from the environment so that we can authenticate. And we're going to create an OpenAI client. This is using their new version one API, where they've wrapped everything in this one nice client object for us. So running the cell, there won't be any output here, but everything's ready to go. Now we're going to define a function that allows us to call out to the model using our retrieved results, along with our query. We're going to use GPT 3.5 Turbo, which does a reasonably good job in RAG loops and is fairly quick and fast. So the first thing is we're going to pass in our query and retrieved documents. We're going to just join our retrieved documents into a single string called information, and we're going to use the double newline to do so. Then we're going to set up some messages. So the first thing is the system prompt. The system prompt essentially defines how the model should behave in response to your input. And here we're saying: you are a helpful expert financial research assistant. Your users are asking questions about information contained in an annual report. You'll be shown the user's question and the relevant information from the annual report. Answer the user's question using only this information. So what this is doing, and this is really the core of the entire loop, is we're turning GPT from a model that remembers facts into a model that processes information. That's the system prompt. And now we're going to add another piece of the message for our user content. And we have here that we're in the role of the user, and here's the content. The content is essentially a formatted string that says: here's our question, and that's just our original query; here's the information you're supposed to use, and here's the information. Then we need to send the request to the OpenAI client, which is just using the normal API from the client. There's nothing special here at all. We're specifying the model. We're sending the messages. We're basically calling the chat completion endpoint on the OpenAI client, specifying a model and the messages we'd like to send, and getting the response back. And then we need to do a little bit more just to unpack the response from what the client returns. So we have defined our function, and now let's actually use it. Let's put everything together. So here's what we're going to do. We are going to say output is equal to calling RAG with our query and retrieved documents. Then we're just going to print the word-wrapped output. And away we go. Finally, there we go. The total revenue for the year ended June 30th, 2022 was $198,270 million for Microsoft. Microsoft is doing pretty well. Now it's a good time to take a moment and try some of your own queries. So remember we specified the query a little bit further up: what was the total revenue? Try some of your own and see what the model outputs based on the retrieved results from the annual report. I think it's actually really important to play with your retrieval system to gain intuition about what the model and the retriever can and can't do together. Before we dive into really analyzing how the system works, in the next lab we're going to talk about some of the pitfalls and common failure modes of using retrieval in a retrieval augmented generation loop.
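Condensed into code, the pipeline narrated in this lesson looks roughly like the sketch below; the PDF path, splitter settings, collection name, and prompt are reconstructed from the narration and should be treated as assumptions.

```python
# Sketch of the lesson's pipeline: extract -> split -> embed -> retrieve -> answer.
# Paths, parameters and the prompt are assumptions reconstructed from the narration.
import chromadb
from chromadb.utils import embedding_functions
from pypdf import PdfReader
# Depending on your LangChain version, these may live in langchain_text_splitters instead.
from langchain.text_splitter import (RecursiveCharacterTextSplitter,
                                     SentenceTransformersTokenTextSplitter)
from openai import OpenAI

# 1. Extract non-empty page texts from the annual report
reader = PdfReader("microsoft_annual_report_2022.pdf")
pdf_texts = [page.extract_text().strip() for page in reader.pages]
pdf_texts = [text for text in pdf_texts if text]

# 2. Split by character, then re-split by token so every chunk fits the
#    embedding model's 256-token context window
character_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ". ", " ", ""], chunk_size=1000, chunk_overlap=0)
character_chunks = character_splitter.split_text("\n\n".join(pdf_texts))

token_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0, tokens_per_chunk=256)
token_chunks = [chunk for text in character_chunks for chunk in token_splitter.split_text(text)]

# 3. Embed the chunks and load them into a Chroma collection
embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction()
collection = chromadb.Client().create_collection(
    "microsoft_annual_report_2022", embedding_function=embedding_fn)
collection.add(ids=[str(i) for i in range(len(token_chunks))], documents=token_chunks)

# 4. Retrieve the most relevant chunks and hand them to the LLM
query = "What was the total revenue?"
retrieved = collection.query(query_texts=[query], n_results=5)["documents"][0]

client = OpenAI()
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system",
         "content": "You are a helpful expert financial research assistant. "
                    "Answer the user's question using only the provided information."},
        {"role": "user",
         "content": f"Question: {query}\n\nInformation:\n" + "\n\n".join(retrieved)},
    ],
)
print(response.choices[0].message.content)
```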
11 | Lesson 2: Pitfalls Of Retrieval - When Simple Vector Search Fails
12 | Lesson Link: https://learn.deeplearning.ai/courses/advanced-retrieval-for-ai/lesson/s49c1/pitfalls-of-retrieval---when-simple-vector-search-fails
13 | In this lesson, we're going to learn a little bit about some of the pitfalls of retrieval with vectors. I want to show you some cases where simple vector search really isn't enough to make retrieval work for your AI application. Just because things are semantically close as vectors under a particular embedding model doesn't always mean you're going to get good results right out of the box. Let's take a look. First thing we need to do is just get set up. Our helper utilities this time will let us load up everything we need to load from chroma, have the right embedding function ready to go, and we're just going to do a little bit of setup. So again we're going to create the same embedding function. And we're going to use our helper function this time to load our chroma collection. And we're just going to output the count to make sure we've got the right number of vectors in there. And again don't worry about any of these warnings you might see. So yep, that's the right output through us 349 chunks embedded in chroma. So one thing that I personally find useful is to visualize the embedding space. Remember that embeddings and their vectors are a geometric data structure and you can reason about them spatially. Obviously embeddings are very high dimensional. Sentence transformer embeddings have 348 dimensions like we talked about, but we can project them down into two dimensions which humans can visualize. And this can be useful for reasoning about the structure of embedding space. To do that, we're going to use something called Umap. U map is uniform manifold approximation. And it's an open source library that you can use exactly for projecting high dimensional data down into two dimensions or three dimensions, so that you can visualize it. This is a similar technique to something like PCA or t-SNE, except you map explicitly tries to preserve the structure of the data in terms of the distances between points as much as it can. Unlike, for example, PCA, which just tries to find the dominant directions and project data down in that way. So we're going to import Umap and we'll grab numpy and we'll grab tkm. If you don't know what to cdms, it's a little thing that basically shows you a percentage bar, when you have some long running process. I like to use this so that I know how long the iterations are taking and how much longer I might be waiting. And we're going to grab all of the embeddings out of the Corona collection. And what we're going to do is we're going to fit a U map transform. So again, you map is basically a model which fits a manifold to your data to projected down into two dimensions. We're setting the random seed to zero here just so that we can get reproducible results. And we get the same projection every time. So let's go ahead and fit that transform. And again don't worry about any warnings you might get here. Now in this next step, now that we fitted the transform we're going to use the transform to project the embeddings. And we're going to define a function that does that. We're going to call it project embeddings. And it takes as input an array of embeddings. And it takes the transform itself. And we're going to start by declaring an empty array empty numpy array, of the same length of as our embeddings array, but with dimension two, because we're just going to get two dimensional projections out. And what we're going to do is we're going to project the embeddings one by one. The reason to do it one by one is just so that we get consistent behavior from you. 
The way that UMAP does projection is somewhat sensitive to its inputs. So to ensure that we have reproducible results, we're just going to project one at a time instead of in batches. And then of course we're just going to return the result of the function, just the way that you would expect. Having defined the function, let's run it on our data set. And this will take a minute. Great. So now that process is finished, let's project the embeddings and actually take a look at them. So we're going to grab matplotlib, and probably most of you are fairly familiar with matplotlib, I know. We're going to make a figure. We're just going to do a scatter plot of the projected embeddings. So you can see we have the projected dataset embeddings, the first element of each one and the second element of each one. And we're going to make them size ten just because it's visually pleasing. We're going to set some other properties of our axes. And there we go. And this is what our data set looks like inside Chroma, projected down to two dimensions. And you can see that we preserve some structure. A little bit more advanced visualization would allow you to hover over each of these dots and see what's actually in there, and you would see that things with similar meanings end up next to each other, even in the projection. Sometimes there are slightly unusual structures, because a two dimensional projection cannot represent all of the structure of the higher dimensional space. But as I said, it is useful for visualization. And one thing that it's useful for is to bring your own thinking into a more geometric setting and actually think about vectors and points, which is what embedding space retrieval is really all about. So what evaluating the quality and performance of a retrieval system is all about is actually relevancy and distraction. So let's take a look at our original query again, the one that we used in our example: what's the total revenue? And we're going to do just the same thing as we did last time. We're going to query the Chroma collection using this query, ask for five results, and we're going to include the documents and the embeddings, because we'd like to use those embeddings for visualization. And so we're going to grab our retrieved documents out of the results again. And let's print them out. And we see again the same results as we saw before. Retrieval is deterministic in this case. And we see that there are several revenue related documents. But also there are things here that might not be directly related to revenue. And we see things like potentially costs, things that are to do with money, but not necessarily revenue. So let's take a look at how this query actually looks when visualized. So what we're going to do is grab the embedding for our query using the embedding function. And we're going to grab our retrieved embeddings as well, which we get from our result. And what we're going to do is use our projection function to project both of these down to two dimensions. And then now that we've got the projections, we can visualize them, and we can visualize them against the projection of the data set. I'll just copy paste this in. But it's again a scatterplot of the data set embeddings, of the query embedding, and of the retrieved embeddings. And we're going to set the query embedding to be a red X. And we're going to show the selected or retrieved embeddings as empty circles, which are green.
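Before looking at the result, here is a rough sketch of the projection workflow just described. This is an approximation, not the course notebook itself; it assumes a loaded chroma_collection from the helper utilities and the umap-learn, numpy, tqdm, and matplotlib packages.

import umap
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

# Pull every embedding out of the (already loaded) Chroma collection.
embeddings = chroma_collection.get(include=['embeddings'])['embeddings']

# Fit a UMAP transform; fixing the seeds keeps the projection reproducible.
umap_transform = umap.UMAP(random_state=0, transform_seed=0).fit(embeddings)

def project_embeddings(embeddings, umap_transform):
    # Project one embedding at a time so the result doesn't depend on batch composition.
    projected = np.empty((len(embeddings), 2))
    for i, embedding in enumerate(tqdm(embeddings)):
        projected[i] = umap_transform.transform([embedding])[0]
    return projected

projected_dataset_embeddings = project_embeddings(embeddings, umap_transform)

# Scatter plot of the whole dataset projected down to two dimensions.
plt.figure()
plt.scatter(projected_dataset_embeddings[:, 0], projected_dataset_embeddings[:, 1], s=10)
plt.axis('off')
plt.show()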
So let's go ahead and see what that looks like. And here we are. So this is a visualization of the query and the retrieved embeddings. You can see the query here is this red X, and the green circles basically circle those data points that we actually end up retrieving. Notice that it doesn't look in the projection like these are actually the nearest neighbors. But remember, we're trying to squash down many, many higher dimensions into this two dimensional representation, so it's not always going to be perfect. But the important thing is to basically look at the structure of these results. So you can see some are more outlying than others, right? And this is actually the heart of the entire issue: the embedding model that we use to embed queries and embed our data does not have any knowledge of the task or query we're trying to answer at the time we actually retrieve the information. So the reason that a retrieval system may not perform the way that we expect is because we're asking it to perform a specific task using only a general representation, and that makes things more complicated. Let's try visualizing a couple of other queries in a similar way. So here I'm just going to copy paste the whole thing, but the query now is: what's the strategy around artificial intelligence, that is, AI? So let's run and see what results we get. And you see here that AI is mentioned in most of these documents. And this one is sort of vaguely related to AI: we have a commitment to responsible AI development. But then we have something about a database, which is not directly related to AI. And here we're talking about mixed reality applications and the metaverse, which is tangentially related to technology investments, but not necessarily directly AI related. So let's visualize: first of all, project the same way as we did for the previous query, and then we will plot. Let's take a look. Here's our query and our related results. And they're all coming from the same part of the data set. But you can see the results that we get, and here this point appears to be bang on where our query landed, so it's super, super relevant. So you can see that obviously where the query lands in this space has geometric meaning, and we're pulling in related results. But again, what's related comes from the general purpose embedding model, not from the specific task that we're performing. So let's take a look at another query: what has been the investment in research and development? And this is a very general query, and it should be reflected in the annual statement. So let's see what kind of documents we get back. We see that we start with general ideas about investments. Some of it is about research and development. For example, this document: research and development expenses can include third party development and programming costs. But we see that there are also distractors in these results. So a distractor is a result that is not actually relevant to the query. And it's called a distractor because if you pass this information to the large language model to complete the RAG loop, the model tends to get distracted by this information and outputs suboptimal results. And the reason this is really important is that bad behavior from the model due to distractors is very difficult to diagnose and debug, both for the user, but also for developers and engineers building these types of systems.
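As a concrete sketch of the query visualization steps described above, reusing chroma_collection, embedding_function, project_embeddings, umap_transform, and projected_dataset_embeddings from the earlier sketch (again an approximation, not the course notebook):

query = "What has been the investment in research and development?"

results = chroma_collection.query(
    query_texts=[query], n_results=5, include=['documents', 'embeddings'])
retrieved_embeddings = results['embeddings'][0]

# Project the query and the retrieved results into the same 2-D space as the dataset.
query_embedding = embedding_function([query])[0]
projected_query = project_embeddings([query_embedding], umap_transform)
projected_retrieved = project_embeddings(retrieved_embeddings, umap_transform)

plt.figure()
plt.scatter(projected_dataset_embeddings[:, 0], projected_dataset_embeddings[:, 1], s=10, color='gray')
plt.scatter(projected_query[:, 0], projected_query[:, 1], s=150, marker='X', color='r')                      # red X = query
plt.scatter(projected_retrieved[:, 0], projected_retrieved[:, 1], s=100, facecolors='none', edgecolors='g')  # green circles = retrieved
plt.axis('off')
plt.show()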
So it's very important to make your retrieval system robust, and return relevant results and no distracting results to the model. So again, let's take a look at the projection. I always find it very helpful to visualize, again because this is a geometric type of data; I find visualization is a great way to develop intuitions. So there's our projection, and let's plot it. So here we see the results that we're getting are a lot more spread out. And the way you can imagine this is: imagine all your data is a cloud of points sitting in this high dimensional space. A query that lands inside the cloud is likely to find nearest neighbors that are densely packed and close together inside the cloud, but a query that lands outside the cloud is likely to find nearest neighbors from a lot of different parts of that cloud, so they tend to be more spread out. So that's the geometric intuition. So finally, I think it's really important to understand what happens when we put an irrelevant query into our retrieval system. So let's find out what Michael Jordan has done for us lately in terms of the Microsoft annual report from 2022. Obviously, I would be very surprised if this was at all a relevant query. And when we look at the results, of course, none of them have anything to do with Michael Jordan. This doesn't mention him at all, and neither do any of these documents. Neither do any of these results. And that's what we should expect. But remember, if we're using a retrieval system as part of a RAG loop, you're guaranteed to get back the nearest neighbors. In this case, your context window is going to be made up entirely of distractors, which, as I mentioned earlier, can be very difficult to understand and debug from the application user's perspective and from the application developer's perspective. So we need a way to deal with irrelevant queries as well as irrelevant results. And again, let's take a look at the projection. Let's see if there's something we can understand. Great, we've projected, and let's plot it. You can see that the results about Michael Jordan are really all over the place, which I guess shouldn't surprise us given that the query is totally irrelevant to any of the data that we have in our data set. Try visualizing some of your own queries in the way that we've done here, and see how they influence the structure of the returned results. See if you can get queries to land in different parts of the data set and see what the returned results say about the information that might be contained in that part of it. In this lab, you've learned how a simple embedding space retrieval system might return distracting or irrelevant results, even for simple queries. And you've learned how to visualize this data so you can gain some intuition about why and how the results are being returned. In the next lab, we'll show you some techniques to improve the quality of your queries using LLMs, by using a technique called query expansion.
14 | Lesson 3: Query Expansion
15 | Lesson Link: https://learn.deeplearning.ai/courses/advanced-retrieval-for-ai/lesson/cwewy/query-expansion
16 | The field of information retrieval has been around for a while as a subfield of natural language processing, and there are many approaches to improving the relevancy of query results. But what's new is we have powerful large language models, and we can use those to augment and enhance the queries that we send to our vector based retrieval system to get better results. Let's take a look. So the first type of query expansion we're going to talk about is called expansion with generated answers. Typically the way that this works is you take your query and you pass it over to an LLM, which you prompt to generate a hypothetical or imagined answer to your query. And then you concatenate your query with the imagined answer and use that as the new query, which you pass to your retrieval system or your vector database. Then you return your query results as normal. Let's take a look at how this works in practice. So the first thing that we're going to do is once again grab all the utilities that we need. And we're going to load everything we need from Chroma and create our embedding function. And we're going to set up our OpenAI client again, because we'll be using the LLM. And once again, to help with visualization, we're going to use UMAP and project our data set, so that's all ready to go for us. Now that we're done setting up, let's take a look at expansion with generated answers. And there's a reference here to the paper which demonstrates some of the empirical results that you can get by applying this method. So to do expansion with generated answers, we're going to use an LLM, in this case GPT. And just the same as last time, we're going to prompt the model in a particular way. Let's create this function called augment query generated. And we're going to pass in a query. We're also going to pass in a model argument, in this case GPT-3.5 Turbo by default. And we're going to prompt the model. And in the system prompt we're going to say: you are a helpful expert financial research assistant. Provide an example answer to the given question that might be found in a document like an annual report. In other words, we're pretty much asking the model to hallucinate, but we're going to use that hallucination for something useful. And in the user prompt, we're just going to pass the query as the content. And then we'll do our usual unpacking of the response. And that defines how we're going to prompt our model. Let's wire this together. Here's our original query, asking: was there significant turnover in the executive team? We will generate a hypothetical answer and then we'll create our joint query, which is basically the original query with the hypothetical answer appended. Let's take a look at what this actually looks like after we generate it. So here we see the output. We see our original query, was there significant turnover in the executive team, and a hypothetical answer: in the past fiscal year, there was no significant turnover in the executive team; the core members of the executive team remained unchanged, etc. So let's send this query plus the hypothetical response to our retrieval system as a query, and we'll query the Chroma collection the usual way and print out our results. And we're sending the joint query as the query to our retrieval system. And we're retrieving the documents and the embeddings again. So these are the documents that we get back. We see things here discussing leadership. We see how consultants and directors work together.
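A minimal sketch of the expansion-with-generated-answers flow described above might look like this. The prompt wording and the model name are approximations of what the lesson describes, and it assumes an OpenAI client created as openai_client plus the chroma_collection from the setup.

def augment_query_generated(query, model="gpt-3.5-turbo"):
    # Ask the model for a plausible, hallucinated answer that might appear in an annual report.
    messages = [
        {"role": "system", "content": (
            "You are a helpful expert financial research assistant. Provide an example answer "
            "to the given question that might be found in a document like an annual report.")},
        {"role": "user", "content": query},
    ]
    response = openai_client.chat.completions.create(model=model, messages=messages)
    return response.choices[0].message.content

original_query = "Was there significant turnover in the executive team?"
hypothetical_answer = augment_query_generated(original_query)

# The joint query is the original question followed by the imagined answer.
joint_query = f"{original_query} {hypothetical_answer}"

results = chroma_collection.query(
    query_texts=[joint_query], n_results=5, include=['documents', 'embeddings'])
retrieved_documents = results['documents'][0]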
Here we have sort of an overview of the different directors that we had in Microsoft. And we talk about the different board committees. Let's visualize this. Let's see what sort of difference this made. So to do that, we get our retrieved embeddings, we get the embedding for our original query, we get the embedding for our joint query, and then we project all three and plot the projection. And we see the red X is our original query, and the orange X is our new query with the hypothetical answer. And we see that we get this nice cluster of results. But most importantly, what I want to illustrate here is that using the hypothetical answer moves our query elsewhere in space, hopefully producing better results for us. So that was query expansion with generated answers. But there's another type of query expansion we can also try. This is called query expansion with multiple queries. And the way that you use this is to use the LLM to generate additional queries that might help in answering the question. So what you do here is you take your original query, you pass it to the LLM, you ask the LLM to generate several new queries related to the same original query, and then you pass those new queries along with your original query to the vector database or your retrieval system. That gives you results for the original and the new queries, and then you pass all of those results to the LLM to complete the loop. So let's take a look at how this works in practice. Once again the starting point is a prompt to the model. And we see here that we have a system prompt, and the system prompt is a bit more detailed this time. We take in a query, which is our original query, and we tell the model it's a helpful expert financial research assistant, and the users are asking questions about an annual report. So this gives the model enough context to know what sorts of queries to generate. And then you say: suggest up to five additional related questions to help them find the information they need for the provided question. Suggest only short questions without compound sentences. This makes sure that we get simple queries. Then: suggest a variety of questions that cover different aspects of the topic. And this is very important, because there are many ways to rephrase the same query, but what we're actually asking for is different but related queries. And finally, we want to make sure that they're complete questions, and that they're related to the original question. And we ask for some output formatting. One important thing to understand about these techniques, in particular those that bring an LLM into the loop of retrieval, is that prompt engineering becomes a concern. It's something that you have to think about. And I really recommend that you as a student play with these prompts once you have a chance to try the lab. See how they may change things, see what different types of queries you can get the models to generate. Try different models, and basically experiment not just with the retrieval system, but with the prompts you're using to augment your queries. So let's define this function and let's see what we get when we actually try this. So here's our original query: what were the most important factors that contributed to increases in revenue? So to speak, this is a compound query. There could be many, many different factors, and it's not just about revenue. And let's see what augmented queries we get back. Let's call the model and print the set of augmented queries as output. Great. So we got a few back.
We see: what were the most important factors that contributed to decreases in revenue? Great question. What were the sources of revenue? Also very important. How were sales and revenue distributed across the different product lines? Were there any changes in pricing strategy? Did the company acquire any new customers? So you can see that these are related questions to our original query, but they're not precisely the same and they have different meanings. That's very, very useful. And that's a great illustration of augmenting an original query through query expansion with multiple queries. So let's see how this works in practice once we pass these queries to our retrieval system. So first we build our set of queries. Now, Chroma can handle multiple queries in parallel. So what we're doing here is taking our original query in an array, and then concatenating that with our array of augmented queries. So now we have one array where each entry is a query: our original query plus the augmented queries. And we can grab the results. And again, Chroma can do querying in batches. And let's look at the retrieved documents that we get. And one thing that's important here is because the queries are related, you might get the same document retrieved for more than one query. So what we need to do is deduplicate the retrieved documents, and that's what we do here. And finally let's just output the documents we get. So we can see now the documents that we got for each query. And these are all to do with revenue, different aspects of revenue growth, which is exactly what we were hoping for. We have increases in Windows revenue. We can see things that are coming from other components. So for example, for what were the most important factors that contributed to decreases in revenue, we see increased sales and marketing expenses, different types of investments, different types of tax breaks. Essentially each of these augmented queries is providing us with a slightly different set of results. And let's visualize that: what did we actually get in geometric space in response to these queries? So again, we'll take our original query embedding and our augmented query embeddings and project them. And the next thing we'll do is project the result embeddings. Before we do that we need to flatten the list, because we have a list of embeddings per query, and we just want a flat list of all the returned embeddings. And then we just project them as before. Let's visualize what we get, and we see that using query expansion, we're able to actually hit other related parts of the data set that our single original query may not have reached. And this gives us more of a chance to find all of the related information, especially in the context of more complex queries, which require more and different types of information to answer. So here we see that the red X is our original query, the orange Xs are the augmented queries, the new queries generated for us by the LLM. And once again, the green circles represent the results that were actually returned by the retrieval system to the model. One way to think about this is that a single query turns into a single point in embedding space, and a single point in embedding space likely doesn't contain all of the information that you need to answer a more complex query like this one. So using this form of query expansion, where we generate multiple related queries using an LLM, gives us a better chance of capturing all of the related information.
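Query expansion with multiple queries might be sketched along these lines. Again an approximation: the prompt wording is paraphrased, and openai_client and chroma_collection are assumed from the setup.

def augment_multiple_query(query, model="gpt-3.5-turbo"):
    messages = [
        {"role": "system", "content": (
            "You are a helpful expert financial research assistant. Your users are asking "
            "questions about an annual report. Suggest up to five additional related questions "
            "to help them find the information they need. Suggest only short questions without "
            "compound sentences, covering different aspects of the topic. "
            "Output one question per line without numbering.")},
        {"role": "user", "content": query},
    ]
    response = openai_client.chat.completions.create(model=model, messages=messages)
    return response.choices[0].message.content.split("\n")

original_query = "What were the most important factors that contributed to increases in revenue?"
augmented_queries = augment_multiple_query(original_query)

# Chroma accepts a batch of queries, so send the original plus the augmented ones together.
queries = [original_query] + augmented_queries
results = chroma_collection.query(query_texts=queries, n_results=5, include=['documents', 'embeddings'])

# Deduplicate: related queries often retrieve the same chunk more than once.
unique_documents = set()
for documents in results['documents']:
    unique_documents.update(documents)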
The downside of this, of course, is that now we have a lot more results than we had originally, and we're not sure which, if any, of these results are actually relevant to our query. In the next lab, we'll look at cross encoder reranking, a technique that allows us to actually score the relevancy of all the returned results and use only the ones we feel match our original query. I'll demonstrate that in the next lab. In this lab, I recommend that you try playing around with the query expansion prompts. Try your own queries and see the types of results you get by asking different types of questions about the Microsoft annual report.
17 | Lesson 4: Cross Encoder Re Ranking
18 | Lesson Link: https://learn.deeplearning.ai/courses/advanced-retrieval-for-ai/lesson/nusf7/cross-encoder-re-ranking
19 | In the last lesson, we looked at how to improve retrieval results by augmenting the query we send with an LLM. In this lesson, we're going to use a technique called cross encoder reranking to score the relevancy of our retrieved results for the query that we sent. Let's dig in. Reranking is a way to order results and score them according to their relevancy to a particular query. So let's take a look at how this works. In reranking, after you retrieve results for a particular query, you pass these results along with your query to a reranking model. This allows you to rerank the output so the most relevant results have the highest rank. Another way to think about this is that your reranking model scores each of the results conditioned on the query, and those with the highest score are the most relevant. Then you can just select the top ranking results as the most relevant to your particular query. So let's take a look at how to do this in practice. First we import our helper functions as before, and we load the data into Chroma. So one use of reranking is to get more information out of the long tail of query results. So let's take a look at this query that we've already covered once before, which is: what has been the investment in research and development? Usually we've been asking for five results returned for our particular query, but now we're going to ask for ten. That means we're going to get a longer tail of possibly useful results. And again, we're going to include documents and embeddings. So let's retrieve the documents and take a look at what we get. We see that we get the same first five results as before, because retrieval is deterministic, but we also have five new results which might have information relevant to our question. The trick is to figure out which of these results are actually relevant to our specific query, instead of just being the nearest neighbors in embedding space. And the way we do that is through using cross encoder reranking. So we're going to use the sentence transformer cross encoder, and we're going to instantiate it with a particular model. So what is a cross encoder model? Sentence transformers come in two kinds of models. There's something called a bi-encoder, where the queries and documents are encoded separately, and then we can use the output of those bi-encoders to perform cosine similarity and find the nearest neighbors. In contrast, a BERT cross encoder takes both our query and our document and passes them through a classifier which outputs a score. In this way, we can use our cross encoder to score our retrieved results: we pass in the original query and each one of the retrieved documents, and use the resulting score as a relevancy or ranking score for our retrieved results. So we've instantiated our cross encoder, and the first thing we're going to do is create pairs. The pairs consist of our query and each doc in our retrieved documents. And we're just going to ask the cross encoder to score each pair. So let's print out our scores. We see that the first two documents have high scores for our query, but we notice, first of all, that the second retrieved document actually has a much higher score than our first one. And also, some documents in the longer tail of retrieved results have higher scores than some of the documents in the first five.
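A minimal sketch of the scoring just described, before we reorder anything. The cross encoder model name shown here is the one commonly used with sentence-transformers for this purpose and is an assumption; chroma_collection comes from the earlier setup.

import numpy as np
from sentence_transformers import CrossEncoder

cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

query = "What has been the investment in research and development?"
results = chroma_collection.query(query_texts=[query], n_results=10, include=['documents', 'embeddings'])
retrieved_documents = results['documents'][0]

# Score each (query, document) pair; higher scores mean more relevant to this query.
pairs = [[query, doc] for doc in retrieved_documents]
scores = cross_encoder.predict(pairs)

# New ordering: indices of the documents sorted by score, best first.
new_order = np.argsort(scores)[::-1]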
So what would that look like if we were to reorder our documents according to score? We see that the second document is now ranked first, the first document is ranked second, and something in the long tail actually makes it into the top five. And in fact, the top five ranked results contain results that were originally the sixth and seventh results, while the fourth and fifth results are actually ranked lower. So in this way, we've used the cross encoder and the score that it produces to rerank our results. And now, if we were to cut to the top five, we'd see that the results should be much more relevant than what we had before, because we've mined more of the long tail for information that's actually relevant to our question. Now, you might already see where I'm going with this, but given the number of results that we get with query expansion, and the way in which each generated query addresses a different part of the complex question, we can use the cross encoder reranking technique to actually get the best results for the original query from the augmented, expanded queries, instead of just sending all of them to the LLM. And here's how we do that. So from the previous lab, this is just our original query and the generated queries, which I've saved into text here for you. And then we do the same thing: we concatenate the original and generated queries together and we retrieve the results. And then, as last time, we deduplicate the retrieved results. And now we create pairs, just as we did in the previous example, where we make pairs of the original query and each retrieved document. In this way, we can compute the relevance of the retrieved results for the augmented queries to the original query, and select among them the five best that we actually want to pass to the LLM. So let's create those pairs, and let's score them. And one great thing about using a cross encoder model like this one is that it's extremely lightweight and runs completely locally. So here are the scores of all of our retrieved results. And we can use these scores to order our results and give us a new ordering. And then we can pass the top five of these newly ordered results to the LLM, and get the most relevant information from the long tail of results that we got from query expansion and the retrieval for our augmented queries. So in this lab, we learned how to use a cross encoder as a reranking model. And we've seen how we can apply reranking both to get more out of the long tail of a single query, as well as to filter the results of an augmented, expanded query down to only the results relevant to the original query itself. This is a really powerful technique and worth experimenting with some more, and it's a good idea to try to understand and get an intuition for how the reranking score might change depending on the query, even when the results that your retrieval system is giving you are the same. This is because the cross encoder reranker can emphasize different parts of the query than the embedding model does, and so the ranking that it provides is much more conditional on the specific query than just what is naively returned by the retrieval system itself. In the next lab, we'll talk about embedding adapters. Embedding adapters are a way to directly alter or augment the query embedding itself, using user feedback or other types of data, to get better query results.
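Returning to the combination of query expansion and reranking walked through above, it could be sketched like this, reusing cross_encoder and numpy from the previous sketch and the deduplicated unique_documents and original_query from the query expansion sketch. The top five re-ranked documents are what you would pass to the LLM.

# Score every deduplicated document against the ORIGINAL query only.
unique_documents = list(unique_documents)
pairs = [[original_query, doc] for doc in unique_documents]
scores = cross_encoder.predict(pairs)

# Keep the five highest-scoring documents for the LLM's context.
top_indices = np.argsort(scores)[::-1][:5]
top_documents = [unique_documents[i] for i in top_indices]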
20 | Lesson 5: Embedding Adaptors
21 | Lesson Link: https://learn.deeplearning.ai/courses/advanced-retrieval-for-ai/lesson/s5dr4/embedding-adaptors
22 | In the last couple of lessons, we've looked at how we can use query augmentation and cross encoder reranking to improve retrieval results. In this lesson, I'm going to show you how we can use user feedback about the relevancy of retrieved results to automatically improve the performance of the retrieval system, using a technique called embedding adapters. Let me show you how it works. Embedding adapters are a way to alter the embedding of a query directly in order to produce better retrieval results. In effect, we insert an additional stage in the retrieval system, called the embedding adapter, which happens after the embedding model but before we retrieve the most relevant results. We train the embedding adapter using user feedback on the relevancy of our retrieved results for a set of queries. Let's get into it. So the first thing we do is grab our helper functions as before. One special thing here is we're going to need torch, because we're going to effectively train a model, but a very lightweight one. And we create our embedding function and load everything into Chroma. And again, we project all of our data. So the first thing we need for this approach is a data set. We don't have one ready, because we haven't really had users using our application. But we can use a model to generate a data set for us. And once again, this is all about creating the right prompt. So we're going to use GPT again. And essentially we're prompting the model as an expert, helpful financial research assistant, and it should suggest 10 to 15 short questions that are important to ask when analyzing an annual report, with some guidelines about what the output should be like. And this will generate some queries that users might actually have run against our system. So let's ask the model to generate these queries. We see that these are fairly reasonable questions to ask about any company's financial statements. So we're going to get the results from Chroma, and we'll get the retrieved documents associated with the results. And what we're going to do is also ask the model to evaluate the results. In a real system, you can easily ask your users to give a thumbs up or thumbs down on the generated output, and then cross-reference that with the retrieved results to give a signal about which results were actually relevant and which ones weren't. In this case, we don't quite have that, but we can use a model to evaluate the relevancy of the retrieved results for each query. And again, this is just about prompting the model. So we're going to ask our helpful expert financial assistant to tell us whether a given statement is relevant to the given query. And we're going to ask it to output only yes or no. And then we're going to essentially transform yeses to ones and nos to negative ones, and I'll explain why in just a minute. That's the prompt. And then what we're going to do is we are going to get our retrieved embeddings and our query embeddings, and we're going to start making a data set to train our embedding adapter. And the way we're going to do it is like this. We're going to have our adapter query embeddings, our adapter doc embeddings, and our adapter labels. Now, the adapter prefix just means we're going to use these in a data set; they're not special in any way. They're just the embeddings of our queries and the embeddings for our documents. The labels we're going to get from our evaluation model. The label is going to be plus 1 or -1, depending on whether the document is relevant or not to the given query.
So we're just going to loop over everything to create these triples, and the model is performing the evaluation for us. Now, it's no mistake that our labels are plus one and minus one, because what we're going to do when we're training our embedding adapter model is use these values as targets for cosine similarity in our loss function. When two vectors are identical, the cosine similarity between them is one. When two vectors are opposite, the cosine similarity between them is negative one. In other words, we want relevant results to point in the same direction as the query vector, and we want irrelevant results to point in the opposite direction from a given query. And the model that we're going to train, that's exactly what it's going to try to do. All right, let's check out the length of our data set. Great, 150. So that's 15 queries with ten results each, each one labeled for relevancy. So the next thing we need to do, because we're using torch to train our embedding adapter, is to transform our data set into a torch tensor data set. So we're just going to do some data manipulation here to transform these into torch tensor types, and finally we're going to pack everything into a torch data set. So let's set up our embedding adapter model. The first thing is to set up the model itself. And the model is fairly straightforward. It takes as input a query embedding, a document embedding, and an adapter matrix. We compute an updated query embedding by multiplying our original query embedding by the adapter matrix, and then we compute the cosine similarity between our updated query embedding and our document embedding. Next, let's define our loss function. Again, our loss takes a query embedding, document embedding, adapter matrix, and label. And we run the model to compute the cosine similarity, and we compute the mean squared error between the cosine similarity and the label. And you'll notice again that a plus one label means the cosine similarity should say the vectors are pointing in the same direction, and a negative one label means they should be pointing in the opposite direction. In this way, we want our queries to be pointing in the same direction as relevant documents and in the opposite direction to irrelevant documents, and this is what we're training our adapter matrix to do. We initialize our adapter matrix for training. You might recognize this is very similar to a linear layer in a traditional neural network, and that's really all we're doing. Next, let's set up our training loop. We set our minimum loss and our best matrix as things to keep track of. We train for 100 epochs: for each query embedding, document embedding, and label in our torch data set, we compute our loss, and if the loss that we computed is better than our previous best loss, we keep track of the current matrix as the best matrix so far. And then we backpropagate. And let's run our training loop. And you can see it's very, very fast, because again, this is exactly the same as training a single linear layer of a traditional neural network. So let's take a look at the best loss that we got. This is pretty good. A loss of 0.5 is pretty good; it means we've gotten pretty much a halfway improvement from where we started. So one thing we'd like to take a look at is how the adapter matrix influences our query vector. To do that, we can construct a test vector consisting of all ones, and we can multiply that test vector by our best matrix.
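Before looking at that test vector, here is a compact sketch of the adapter model, loss, and training loop described above. It assumes adapter_query_embeddings, adapter_doc_embeddings, and adapter_labels were built as lists in the labeling step; the learning rate is an assumption.

import torch
import numpy as np

# Pack the (query embedding, document embedding, label) triples into a torch dataset.
q = torch.Tensor(np.array(adapter_query_embeddings))
d = torch.Tensor(np.array(adapter_doc_embeddings))
labels = torch.Tensor(np.array(adapter_labels))
dataset = torch.utils.data.TensorDataset(q, d, labels)

def model(query_embedding, document_embedding, adapter_matrix):
    # Adapt the query, then compare it to the document with cosine similarity.
    updated_query_embedding = torch.matmul(adapter_matrix, query_embedding)
    return torch.cosine_similarity(updated_query_embedding, document_embedding, dim=0)

def mse_loss(query_embedding, document_embedding, adapter_matrix, label):
    # +1 labels pull relevant documents toward the query, -1 labels push irrelevant ones away.
    return torch.nn.MSELoss()(model(query_embedding, document_embedding, adapter_matrix), label)

embedding_dim = q.shape[1]
adapter_matrix = torch.randn(embedding_dim, embedding_dim, requires_grad=True)

min_loss = float('inf')
best_matrix = None
for epoch in range(100):
    for query_embedding, document_embedding, label in dataset:
        loss = mse_loss(query_embedding, document_embedding, adapter_matrix, label)
        if loss < min_loss:
            min_loss = loss.detach().item()
            best_matrix = adapter_matrix.clone().detach().numpy()
        loss.backward()
        with torch.no_grad():
            adapter_matrix -= 0.01 * adapter_matrix.grad  # plain gradient descent step
            adapter_matrix.grad.zero_()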
And what this will tell us is which dimensions of our vectors get scaled by what amount. You can think of an embedding adapter as stretching and squeezing space: it stretches the dimensions which are most relevant to the particular queries that we have, while squeezing the dimensions that are not relevant to our queries. You'll also notice that it can reverse dimensions. So let's plot what that looks like. And here you can see how each dimension of our test vector, which consists only of ones, has been stretched and squeezed. Some have been elongated a lot, while others have been made to be almost zero. And so what this means is our embedding adapter has basically decided: okay, these dimensions are more relevant, these are less relevant, these are actually opposite to the things that we want to find, and these are actually more relevant to the things that we want to find. Now let's take a look at what effect this actually has on our queries. So let's do as we did before: we'll take our generated queries and embed them, and also compute our adapted query embeddings. And then we project them. Now let's plot what we get against our data set. And as you can see, our original queries were quite scattered around, but our new queries concentrate on a certain part of the data set which is most relevant to our queries. You can see how the red queries have been adapted through the embedding adapter to transform them into the green queries, pushing them into a particular part of the space. So, as you can see, an embedding adapter is a simple but powerful technique for customizing query embeddings to your specific application. In order to make this work, you need to collect a data set: either a synthetic one like the one we've generated here, or else one that's based on user data. User data usually works best, because it actually means that people are using your retrieval system for their specific tasks. Again, because this approach involves prompting and because it involves the use of a large language model, it's worth experimenting with the prompts. And it's also worth experimenting with different initializations of the adapter matrix. You might even consider using a full lightweight neural network and training that instead of a simple matrix. You might want to tune the hyperparameters of the embedding adapter training process, or you might want to collect more specific data and try this out with a specific application in mind, rather than our very general one of trying to understand a financial statement. In the next lesson, we'll cover some other techniques which are just now emerging from research to improve embedding based retrieval systems.
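A short sketch of the test-vector inspection and of applying the learned matrix, with numpy and matplotlib imported as in the earlier sketches. best_matrix comes from the training sketch above, and query_embeddings is assumed to hold the embeddings of the generated queries.

# Multiply a vector of all ones by the adapter matrix to see how each dimension is scaled or flipped.
test_vector = np.ones(best_matrix.shape[0])
scaled_vector = np.matmul(best_matrix, test_vector)

plt.bar(range(len(scaled_vector)), np.sort(scaled_vector))
plt.title("Scaling applied to each embedding dimension by the adapter matrix")
plt.show()

# Adapt the query embeddings before retrieval: each query embedding is multiplied by the matrix.
adapted_query_embeddings = np.matmul(best_matrix, np.array(query_embeddings).T).T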
23 | Lesson 6: Other Techniques
24 | Lesson Link: https://learn.deeplearning.ai/courses/advanced-retrieval-for-ai/lesson/l1uaj/other-techniques
25 | Embeddings-based retrieval is still a very active area of research, and there are a lot of other techniques that you should be aware of. For example, you can fine tune the embedding model directly, using the same type of data as we used in the embedding adapters lab. Additionally, recently there have been some really good results published on fine tuning the LLM itself to expect retrieved results and reason about them. You can see some of the papers highlighted here. Additionally, you could experiment with a more complicated embedding adapter model using a full blown neural network or even a transformer layer. Similarly, you can use a more complex relevancy model rather than just using the cross encoder as we described in the lab. And finally, an often overlooked piece is that the quality of retrieved results often depends on the way that your data is chunked before it's stored in the retrieval system itself. There's a lot of experimentation going on right now about using deep models, including transformers, for optimal and intelligent chunking. And that wraps up the course. In this course, we covered the basics of retrieval augmented generation using embedding space retrieval. We looked at how we can use LLMs to augment and enhance our queries to produce better retrieval results. We looked at how we can use a cross encoder model for reranking to score the retrieved results for relevancy, and we looked at how we can train an embedding adapter using data from human feedback about relevancy to improve our query results. Finally, we covered some of the most exciting work that's ongoing right now in the research literature around improving retrieval for AI applications. Thanks for joining the course, and we're really looking forward to seeing what you build.
26 |
--------------------------------------------------------------------------------
/docs/course4_script.txt:
--------------------------------------------------------------------------------
1 | Course Title: Prompt Compression and Query Optimization
2 | Course Link: https://www.deeplearning.ai/short-courses/prompt-compression-and-query-optimization/
3 | Course Instructor: Richmond Alake
4 |
5 | Lesson 0: Introduction
6 | Lesson Link: https://learn.deeplearning.ai/courses/prompt-compression-and-query-optimization/lesson/c14k8/introduction
7 | Welcome to Prompt Compression and Query Optimization, built in partnership with MongoDB and taught by Richmond Alake. Richmond is a developer advocate at MongoDB and has worked as a machine learning architect and taught AI and ML for many years. Thanks, Andrew. This course shows you how to combine features of a mature, established database with vector search to reduce the cost of serving a large RAG application. Say you're building a conversational RAG application that helps users select a rental property. A user might enter a text query for a one-level ranch on a quiet street. You can use semantic search to find a close match to the user description, using an embedding of the user request and searching a vector database for homes with descriptions that match. But the user may also have hard requirements like three bedrooms, two bathrooms, and maybe no swimming pool. These are better handled with more traditional retrieval, by selecting data based on fields in the database that explicitly store the number of bedrooms, bathrooms, and so on. In this course, you learn to use the best of both worlds: a traditional database with an added vector index, used in RAG applications to retrieve results that are provided to an LLM for final processing. If the retrieved context is very long, this results in a very long prompt and can thus be costly. Were retrieval to return, say, 10,000 tokens, and if you were to run a rental comparison website that searches, say, a million queries per day, and if LLM input tokens cost $10 per million tokens, you could be spending over $36 million a year. So, to help you reduce costs, this course will also cover ways to keep the retrieved results as small and relevant as possible. Thanks, Andrew. Let me describe some of the techniques you will learn. Let's consider your rental app: filtering on the number of bedrooms or bathrooms can be done with a pre-filter or a post-filter. Efficient pre-filtering is done at the database index creation stage: you build a new index of entries that match common queries. So for example, if you know you frequently get queries filtering on bedrooms, you can build an index that includes the bedroom field. So that's pre-filtering. In contrast, post-filtering is done after a vector search query is performed: you then apply a filter to the results to select the subset matching the required condition. Large scale applications may use both of these techniques simultaneously. Another technique to minimize the size of the output is something called projection, which selects a subset of the fields returned from a query. For example, out of 15 fields of a potential rental, you may want to return only three of them: name, number of bedrooms, and price. Now, you could implement all of these operations directly in your application, but the database can optimize these operations for performance and enforce role-based access control, so they are best accomplished there. And another powerful technique is reranking the results of a search. For example, after using the text embeddings of the renter's description to perform a semantic search, you can rerank the results based on other data fields, such as average star rating or number of ratings, to move the more desirable results higher up the list, in order to then generate better context for the LLM. One final technique is prompt compression. If the retrieved information is very lengthy, feeding all this context into an LLM prompt results in a very long prompt, which is expensive to process.
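As an aside, the arithmetic behind the $36 million figure mentioned above works out like this, using the illustrative numbers from the intro:

tokens_per_query = 10_000          # retrieved context per query
queries_per_day = 1_000_000        # queries served per day
cost_per_million_tokens = 10       # USD per million input tokens

daily_cost = tokens_per_query * queries_per_day / 1_000_000 * cost_per_million_tokens
annual_cost = daily_cost * 365
print(f"${daily_cost:,.0f} per day, about ${annual_cost / 1e6:.1f} million per year")
# -> $100,000 per day, about $36.5 million per year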
To reduce these costs, you can use a small, low-cost LLM fine tuned to compress prompts before sending them to the final LLM. There are many opportunities to improve relevance and save costs. Thank you, Andrew. You will learn all these techniques in the next few lessons. You will start this course by implementing a vanilla vector search and end by implementing prompt compression. Many people have worked to create this course from MongoDB. I'd like to thank Apoorva Joshi, Pavel Duchovny, Prakul Agarwal, Jesse Hall, Rita Rodrigues, Henry Weller and Shubham Ranjan. Esmaeil Gargari from DeepLearning.AI also contributed to this course. I hope you enjoy this course. Please go to the next video and let's dive in.
8 | Lesson 1: Vanilla Vector Search
9 | Lesson Link: https://learn.deeplearning.ai/courses/prompt-compression-and-query-optimization/lesson/y8g9n/vanilla-vector-search
10 | This lesson covers vector search and expands on RAG implementation. You explore MongoDB and Pydantic, a Python library crucial for data integrity. Understanding these tools will elevate the quality of your AI projects. Let's dive in. You understand that there is a vast amount of data on the internet right now, and there are a few ways to compare how similar, or how close, one data point is to another. A common method is text search, where you match a query keyword with parts of the content of a data point to compute a match. This is information retrieval in the most basic sense, where you input a keyword or a search term, match it against several data points, and detect if the keyword is in the content. And now you are about to learn how to retrieve data based on its context or meaning. The first step is to gather data. This can be structured data, like data organized in tables or spreadsheets with defined columns, or unstructured data, such as audio and image data. The next step involves passing the data as input to an embedding model. The output of an embedding model is a vector. At this point, you can say that the initial data has been vectorized, and you're left with a numerical representation of the data that captures the context and semantics of the data. This is referred to as a vector embedding. In a high dimensional space, referred to as a vector space, you can compare the distance between two or more embedding vectors to get an indication of how similar they are in semantics or context. So far you understand vector search, an information retrieval technique that uses numerical representations of data, known as vector embeddings, to search and retrieve information. You also understand that traditionally, information retrieval relies on keyword matching, which searches for direct matches between query text and the text within the data set. However, vector search makes use of these embeddings to enable advanced functionality such as semantic search, which understands the context of the query; recommendation systems, which predict user preferences; and retrieval augmented generation, or RAG, which provides additional context for LLM inputs. These capabilities make vector search a powerful tool in various AI applications. Once data, both structured and unstructured, has been collected and encoded into vector embeddings, there is a requirement to store the vectorized data in a specialized data store, referred to as a vector database. Within a vector database, to ensure efficient retrieval of vector data based on vector search queries, it is best practice to index the vector data. A vector search index is a specialized structure that optimizes the storage and retrieval of vector embeddings, allowing for efficient similarity searches. So when a vector search operation is performed, the index facilitates the efficient matching of the query vector against the data set, reducing the time needed to find the most similar vectors. And that takes you down the road of search, specifically vector search in retrieval augmented generation systems. Retrieval augmented generation, or RAG, is a system design pattern that leverages information retrieval techniques, including vector search, and foundation models to provide accurate and relevant responses to user queries. RAG achieves this by retrieving semantically similar data to supplement user queries with additional context, and then combining the retrieved information with the original query as input into large language models.
For example, a typical process using a chat interface would be: you enter your chat message and then you get a response from the LLM. This is not the ideal process, as this doesn't use any relevant data. The ideal process would be: along with the input to the LLM, you add in relevant domain specific data, and the large language model can provide relevant and context-aware responses to your query. Now that you have an understanding of RAG, let's get an overview of the key benefits of the RAG design pattern for LLM applications. Building an AI application that leverages the RAG system design pattern provides a number of benefits, such as grounding the LLM response in relevant and up-to-date information, which will reduce the chances of hallucinations, where the LLM essentially provides wrong or irrelevant information. With retrieval augmented generation, you also have the benefit of reducing the amount of information that is passed as input into the LLM. This can reduce the context you pass into the context window. With RAG, you also remove the need for fine tuning LLMs in some scenarios. More specifically, using retrieval augmented generation, you can utilize your own private data or domain-specific data to ensure that LLM responses meet your specific requirements and needs. Now that you know that LLMs give better answers when supplemented with relevant context, you may wonder where and how to store this data. You may also ask, "How do I implement vector search for information retrieval in the first place?" That's where MongoDB comes in. MongoDB is a developer data platform that offers a NoSQL database with vector search functionality. In your AI applications, MongoDB can act as a storage solution for vector data, acting as a vector database. MongoDB offers even more functionality to act as a data store for operational and transactional data, making it a robust solution as a memory provider for LLM and AI applications, including RAG and agentic systems. You're likely familiar with traditional relational databases. Let's use storing data on a house to illustrate how a relational database works. In a typical relational database, you might have the information about the house, such as the number of rooms and bathrooms, in one table, and the address information of the house in another. With the document model, you model data based on the interaction that happens in the application, and not the other way around. So what is a document in MongoDB? A document is a basic unit of data that is similar to JSON. Each document is a set of key-value pairs, which is the MongoDB equivalent of a row in a relational database. Let's see this in the house example we talked about earlier, where we had the house details and its address attributes. In this example we have all the attributes allocated to a house in one document, including its address. This is an example of a document in MongoDB. Documents are dynamic, meaning they can contain varied fields and structures within the same collection. And a collection in a non-relational database is similar to a table in a relational database. The document model uses a JSON schema, which is a core data model across layers of the tech stack. For example, JSON helps transfer data between website components and REST APIs in the application layer, and it is used for function calling and tool definitions in the model layer when implementing an agent. MongoDB enables flexible fields and varied data storage, with the ability to store different data types.
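For illustration only (these field names and values are made up, not the course's listing data), a house document along the lines described above might look like this, with the address nested inside the same document rather than split into a second table:

house_document = {
    "_id": "house_001",
    "bedrooms": 3,
    "bathrooms": 2,
    "has_pool": False,
    "address": {
        "street": "12 Example Street",
        "city": "Springfield",
        "country": "US",
    },
}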
To ensure your documents are structured properly, you should consider data modeling. Data modeling involves designing the structure of documents and collections to effectively represent and organize data. It's about planning how to store and link data across documents to optimize for performance, scalability, and the specific data access patterns of your application. At times, the layout of components of your application is dictated by the structure and format of your data in a database. In the diagram here, this is represented by the directional arrow coming from the data to the application layer. This represents implementing the application layer based on the information in the data layer. But ideally, you want to start with the needs of your application first and not the data itself. You have to ask, "How would I access my data?" And that should determine how you will model the structure of your data. MongoDB enables you to use a familiar understanding of pipelines, which is present in data processing and machine learning concepts. You can apply the concept of pipelines to ideas within a database layer. When conducting queries using MongoDB, you construct an aggregation pipeline. You can think of an aggregation pipeline as a sequence of data processing stages, where each stage transforms the data as it passes through. This process allows for complex query composition within MongoDB, as we have various stages of data transformation occurring within the pipeline. Here's an example of an aggregation pipeline query. By the way, a query is just a fancy way of describing how to tell the database to produce the specific information you're looking for. Let's say you're managing data from a social media application with a collection of user posts. You want to find the most popular posts, defined by the number of likes, in January 2021. And perhaps you're interested in summarizing the average number of comments and likes per post by category. This aggregation pipeline filters the posts from January 2021, groups them by category, calculates the average likes and comments, and sorts the results by the average likes in descending order. By using the aggregation pipeline, you can leverage your understanding of sequential operations from machine learning and AI pipelines, and apply similar logic to managing and analyzing data in MongoDB, making complex queries quite understandable and manageable. In AI applications, there is a need for data validation and ensuring that data conforms to a certain model. This reduces the likelihood of having errors in production systems. Pydantic is a Python library used for data validation, modeling and management. Pydantic offers features that enable the creation of data schemas that include a definition of the object and its properties. Pydantic also ensures that data conforms to the defined schemas, data types, formats, and constraints. If data doesn't meet the validation criteria, Pydantic handles the error by raising an exception that details the specific validation issues. Before we dive into coding, let's review the data set. It consists of 5,000 Airbnb listings hosted on HuggingFace, featuring details like address, description, transportation, reviews, and comments. For this course, you will use it to build an Airbnb listing recommendation system using RAG techniques. Each record or data point includes image embeddings of the listing photos and text embeddings of the content of the space attribute.
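Going back to the aggregation pipeline example from a moment ago, a hypothetical version written with pymongo might look like this. The collection name (posts) and field names are assumptions for illustration, and db is assumed to be a pymongo database object.

from datetime import datetime

pipeline = [
    # Stage 1: keep only posts from January 2021.
    {"$match": {"date": {"$gte": datetime(2021, 1, 1), "$lt": datetime(2021, 2, 1)}}},
    # Stage 2: group by category and average the likes and comments.
    {"$group": {
        "_id": "$category",
        "avg_likes": {"$avg": "$likes"},
        "avg_comments": {"$avg": "$comments"},
    }},
    # Stage 3: sort categories by average likes, descending.
    {"$sort": {"avg_likes": -1}},
]
results = db.posts.aggregate(pipeline)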
The information in the space attribute has been processed by the OpenAI text-embedding-ada-002 model. Here are the steps we're going to take in the coding section for this lesson. You are going to load the data from HuggingFace. Then you will set up a database connection to access the database and the collection, and you will then insert, or ingest, data into the collection. Then you will conduct a vector search query using a query embedding and the embeddings within the collection. The last step will handle the user query and visualize any responses. Let's dive in. Before we get to the steps that we outlined in the slides, let's see what you will build in this lesson. In this lesson, you will be building a RAG recommendation system that uses vector search to pull relevant results from a vector database to add as additional context to an LLM, as you can see on the screen. You will also observe the execution time of the vector search query, and the user question and system response. The system response is the response from the LLM, and it will include a recommended listing from the data set that was provided to it as additional context, and the reason for choosing this recommendation. You will also observe a table of the attributes of the data that was used as additional context, which will include the name, accommodates, and address fields. This will be shown for all the information retrieved by the vector search query. Let's get started. These are the libraries you use for this notebook, which are pre-installed and available for you on the learning platform. Here, you will import the OS module and load the environment variables within your development environment. We will load the Mongo URI and the OpenAI API key. These have been previously set for you in the development environment. The first step is to load the data set. Here, we'll import the load_dataset function from the datasets library from HuggingFace, which allows us to access data sets from the HuggingFace platform by specifying the path. You will also import the pandas library, aliased as pd, which allows you to conduct data modification and analysis. The first step is to call the load_dataset function by passing it the path to the data set. In this case, this is the Airbnb embeddings data set we spoke about earlier, that contains the text embeddings. You will set streaming to true and use the training partition of this data set. The output of this operation will be assigned to the variable dataset. By calling take on the data set object and specifying the number of data points you want to extract from the data set, you can load a specified number of data points into your environment. The next line converts the data set into a pandas dataframe. This allows for analysis and data modification. The final step is to view the first data points in this data set. As you can see on the screen, we can visualize the first five data points and their attributes, including the values. Pause the video here and take some time to familiarize yourself with the values of each data point. To continue with the visualization of our data set and its data points, we will visualize the attributes of each data point. Here, you can see the various attributes that are captured in each data point within the data set, including the text embeddings. The next step is to conduct document modeling using Pydantic. First we'll import several modules from Pydantic and also the datetime module from Python.
In this lesson, you explore the code in full, but in later lessons you will shorten it with a custom utils module where the extensive code is placed, so you can simply call it within the notebooks. In the modeling step, the first step is to create a Host class that represents or defines the creator of a listing, with attributes such as the host ID, name, location, and response time. The next models you will create are Location and Address; these are used to model the location and address data in the dataset and ensure they conform to the expected types and that required data is present. You then create another model for the review. This model holds the date of a review, the listing ID the review belongs to, the review ID, the reviewer name, and any comments. The final model you will create is the parent model: the Listing model, which uses all of the previously created models as attributes. This model also contains its own attributes such as name, summary, description, transit, and others. This is the key model that holds the information of an Airbnb listing. Now that you have created the models that each data point in the dataset must conform to, you convert the data points into the appropriate data types. This line converts each data point into a Python dictionary and assigns the result to a variable called records, so records holds all your listings from the dataset. To ensure there are no missing values, you conduct a sanity check and replace any NaN values with None. For the final step in the data modeling process, you convert each listing data point into a dictionary and assign the result to a listings variable. You also print out the first element within listings to observe the attributes of each listing. As you can see on the screen, each listing has a name, summary, space, and other attributes. Pause the video here and take some time to familiarize yourself with the attributes. The next step is to create your database and connect to your database cluster. This is a crucial step. For the database creation and connection step, the first step is to import the libraries: MongoClient from pymongo and SearchIndexModel from pymongo's operations module. MongoClient allows us to create a client instance, and SearchIndexModel allows us to define a vector search index in the appropriate format. Next, you assign the database and collection names. The database will be called airbnb_dataset, assigned to the variable database_name, and the collection will be called listings_reviews, assigned to the variable collection_name. Now, you define a function called get_mongo_client, which takes in the MongoDB URI string; this is a string that represents a connection to your cluster. The get_mongo_client function uses the MongoClient constructor, taking the MongoDB URI as its argument along with an app name, to create an object that represents a connection to the database cluster. Once a successful connection is made, this function returns the client object. Once you've created the get_mongo_client function, in the next cell you use it, but first conduct a sanity check to ensure you have the MongoDB URI within your development environment. You then pass the MongoDB URI into the get_mongo_client function.
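For reference, simplified versions of the models and the connection helper described above might look like this. The course's full models have more fields, so treat these attribute lists and the app name as illustrative assumptions:

```python
from datetime import datetime
from typing import List, Optional
from pydantic import BaseModel
from pymongo import MongoClient

# Trimmed-down versions of the Pydantic models described in the narration.
class Host(BaseModel):
    host_id: str
    host_name: str
    host_location: Optional[str] = None
    host_response_time: Optional[str] = None

class Address(BaseModel):
    street: Optional[str] = None
    country: Optional[str] = None
    location: Optional[dict] = None

class Review(BaseModel):
    review_id: str
    date: Optional[datetime] = None
    listing_id: str
    reviewer_name: Optional[str] = None
    comments: Optional[str] = None

class Listing(BaseModel):
    name: str
    summary: Optional[str] = None
    space: Optional[str] = None
    description: Optional[str] = None
    transit: Optional[str] = None
    host: Host
    address: Address
    reviews: List[Review] = []
    text_embeddings: List[float]

def get_mongo_client(mongo_uri: str) -> MongoClient:
    """Create and return a client connected to the MongoDB cluster."""
    client = MongoClient(mongo_uri, appname="devrel.deeplearningai.python")  # app name is an assumption
    print("Connection to MongoDB successful")
    return client
```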
The result from get_mongo_client is assigned to a variable called mongo_client. The mongo_client object provides the get_database method, which returns a database object; you can then access the collection by calling get_collection on the database object. Running the cell shows a successful MongoDB connection. The last step in the database creation and connection stage is to clean out any existing collection. The first time you run this, the result will be zero, because the collection has just been created; in future lessons, you will need to clean the collection and you will see records being deleted. The next step is the data ingestion step. For data ingestion, MongoDB provides a method that makes ingesting data into a MongoDB collection a trivial process: simply call insert_many on the collection object and pass in the listings. Once this cell has completed, you should get an indicator that the data ingestion was successful. The next step is to create the vector search index. This is a crucial step. Remember, the index allows for efficient information retrieval from the vector database. First, assign the name of the field that holds the vector embeddings of the space attribute, text_embeddings, to the variable text_embedding_field_name. Next, assign the string vector_index_text to the variable vector_search_index_name_text; this is the name of your vector search index, and it will be referenced every time you make a vector search query. Now, you can use SearchIndexModel to create an appropriate definition of the vector search index. In this cell, you create your vector search index definition using SearchIndexModel and assign the result to the variable vector_search_index_model. The SearchIndexModel constructor takes as its argument a definition of your vector search index. The mappings specify how the fields are going to be indexed within the database. The dynamic field tells the database to index new fields that appear in the documents. The fields attribute indicates which field in a document holds the vector embedding; text_embedding_field_name is the variable that holds that field name. The dimensions value indicates the size of a single vector embedding within the documents. The similarity field indicates the distance function used to compute the similarity between two vectors. The type knnVector tells the database that the data stored in this field is a vector. The last argument passed into the constructor is the name, which allows the database to identify the vector search index by the given name, vector_index_text. In the next cell, you conduct a check to ensure that the chosen vector search index name doesn't already exist; this is good practice before creating any vector search index definition. Now, you call create_search_index on the collection object to create the vector search index. This is done only if the index doesn't already exist. You will observe on the screen an indication that the index was created successfully. Before moving on to the next cell, you can wait a minute to allow the vector index to initialize. The final step in this process is to define a function called get_embedding.
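Before moving on to get_embedding, here is a rough sketch of the index definition described above. It assumes a recent PyMongo version on an Atlas cluster, 1536 dimensions (the output size of text-embedding-ada-002), and cosine similarity as the distance function; `collection` is the collection object obtained earlier:

```python
from pymongo.operations import SearchIndexModel

text_embedding_field_name = "text_embeddings"
vector_search_index_name_text = "vector_index_text"

# Index definition mirroring the description above; "cosine" is an assumed choice.
vector_search_index_model = SearchIndexModel(
    definition={
        "mappings": {
            "dynamic": True,  # index new fields that appear in documents
            "fields": {
                text_embedding_field_name: {
                    "dimensions": 1536,
                    "similarity": "cosine",
                    "type": "knnVector",
                }
            },
        }
    },
    name=vector_search_index_name_text,
)

# Create the index only if it doesn't already exist.
existing_names = [idx["name"] for idx in collection.list_search_indexes()]
if vector_search_index_name_text not in existing_names:
    collection.create_search_index(model=vector_search_index_model)
```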
The get_embedding function takes in a text, which is the user query entered into the recommendation engine. You conduct a sanity check to ensure the text passed into get_embedding is a string, and then call the embeddings.create method on the OpenAI client to generate an embedding for a single data point. The get_embedding function returns a numerical vector representation of the text that was passed in. The next step is to compose the vector search query. You start by defining a function called vector_search. This function takes in the user query, the database object, and the collection object, and has a vector index argument with a default value corresponding to the name of the vector index created earlier. The first step inside the vector_search function is to transform the user query into a numerical vector representation and assign it to the query_embedding variable. You conduct a sanity check to ensure the query embedding is not empty before moving on to the other processes within this function. The next step is to define the vector search stage. This is the stage responsible for conducting the vector search operation that compares vector embeddings and computes distances. Assign to a variable called vector_search_stage a document that represents the vector search query you are constructing. In MongoDB, operators are prefixed with a dollar sign; the operator here is $vectorSearch, so this document represents a vector search operation. The index field points to the name of the vector index to use for the query. The queryVector field takes in the query embedding, the embedded user query used to compute the distance against candidate vectors from the database. The path field specifies the field where the vector embedding is held within the documents. The numCandidates field is the number of documents you want the vector search operation to consider, and the limit field constrains the output to just 20 results. The next step is to define the pipeline. In MongoDB, a pipeline can be constructed as a Python list containing the stages defined earlier. To create the pipeline for the vector_search function, you have the variable pipeline, which takes a list that includes the vector search stage. The next step is to execute the aggregation pipeline: call the aggregate method on the collection object and pass in the pipeline created previously. The result of this pipeline is assigned to the variable results. For the final part of the vector_search function, you compute how long the vector search operation takes to complete, in milliseconds. This is done by calling the command method on the database object with an explain of the aggregate command, passing in the collection name and the pipeline; this provides an object that includes the execution stats of the vector search operation. The next lines extract the key information and print it in the notebook. The final step in the vector_search function is to return the list of results. The last step of this lesson is to handle the user query using all of the functions you've defined earlier. To ensure the search results, or the documents returned from the database, meet a specific format, you use Pydantic to define a SearchResultItem model.
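Before turning to the SearchResultItem model, here is a rough sketch of get_embedding and vector_search as described above. The candidate pool size of 150 is an assumption, and the exact location of the timing information inside the explain output varies by server version, so the sketch only prints the explain stage keys:

```python
from openai import OpenAI

openai_client = OpenAI()  # assumes OPENAI_API_KEY is set in the environment

def get_embedding(text):
    """Return an embedding vector for a single piece of text."""
    if not text or not isinstance(text, str):
        return None
    response = openai_client.embeddings.create(input=text, model="text-embedding-ada-002")
    return response.data[0].embedding

def vector_search(user_query, db, collection, vector_index="vector_index_text"):
    """Run a $vectorSearch aggregation and return the matching documents."""
    query_embedding = get_embedding(user_query)
    if query_embedding is None:
        return "Invalid query or embedding generation failed."

    vector_search_stage = {
        "$vectorSearch": {
            "index": vector_index,
            "queryVector": query_embedding,
            "path": "text_embeddings",
            "numCandidates": 150,  # assumed candidate pool size
            "limit": 20,
        }
    }
    pipeline = [vector_search_stage]
    results = list(collection.aggregate(pipeline))

    # Ask the server to explain the aggregate command; its execution stats include
    # the vector search timing. Inspect the document to find the exact path.
    explain = db.command(
        "explain",
        {"aggregate": collection.name, "pipeline": pipeline, "cursor": {}},
        verbosity="executionStats",
    )
    print(list(explain.get("stages", [{}])[0].keys()))

    return results
```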
Each SearchResultItem will take on the name, accommodates, address, summary, and other specified attributes. To handle user queries, you define a function called handle_user_query. The handle_user_query function takes the user query, the database object, and the collection object as its arguments. Now you get to use the vector_search function: call it, pass in the query, the database object, and the collection object, and assign the results to a variable called get_knowledge. get_knowledge holds the list of documents retrieved by the vector search operation. You conduct a sanity check to ensure get_knowledge is not empty. Once you've obtained the search results from the vector search operation, held in the get_knowledge variable, you convert them and ensure they conform to the SearchResultItem model. The next step is to convert the search results into a pandas DataFrame, which allows for efficient manipulation of the search results. In this step, you pass the query and the search results to the LLM. In this course, you are using GPT-3.5 Turbo as the LLM for the RAG system. Here, you specify to the system that it's an Airbnb listing recommendation system and pass in the query along with the additional context held in the search results DataFrame variable. The following step extracts the response from the LLM and assigns it to a variable called system_response. Next, you print out the user query and the system response to visualize and observe the process. The final step here is to display the search results as a table, which holds the additional context passed into the LLM as input. The handle_user_query function returns the system response. This is the final step of this lesson, where you assign a string representing a query to a variable called query and pass the query into the handle_user_query function along with the database and collection objects. The query you are using for this lesson, and throughout the course, is one that asks the system to recommend an Airbnb listing that is warm and friendly and not too far from restaurants. Now, you run handle_user_query. Here, you can see that the vector search operation took 0.02 milliseconds, which is very fast. From the print statements, you can identify the query that was passed into the vector search operation, embedded, and used for the vector search. The system response can also be observed: it recommended a cozy listing in the heart of the Plateau, in Canada, and provided a reason why. Pause the video here to read the reason. In this lesson, you learned how to load your data into a development environment, model your data using Pydantic, conduct data ingestion into a connected MongoDB database, and perform a vector search operation. You essentially built a RAG pipeline. In the next lesson, you will explore adding filtering to your vector search operation, including pre- and post-filtering. See you in the next lesson!
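Before moving on to lesson two, here is a rough sketch of the handle_user_query step described above. It builds on the vector_search sketch earlier; the trimmed SearchResultItem model and the prompt wording are assumptions, and .dict() assumes Pydantic v1 (use model_dump() for v2):

```python
from typing import Optional
import pandas as pd
from openai import OpenAI
from pydantic import BaseModel

openai_client = OpenAI()

class SearchResultItem(BaseModel):
    name: str
    accommodates: Optional[int] = None
    address: Optional[dict] = None
    summary: Optional[str] = None

def handle_user_query(query, db, collection):
    get_knowledge = vector_search(query, db, collection)
    if not get_knowledge:
        return "No results found."

    # Conform each document to the model, then tabulate the context for the LLM.
    search_results_models = [SearchResultItem(**doc) for doc in get_knowledge]
    search_results_df = pd.DataFrame([item.dict() for item in search_results_models])

    completion = openai_client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are an Airbnb listing recommendation system."},
            {"role": "user", "content": f"Answer this user query: {query} "
                                        f"with the following context:\n{search_results_df}"},
        ],
    )
    system_response = completion.choices[0].message.content
    print(f"- User Question:\n{query}\n")
    print(f"- System Response:\n{system_response}\n")
    return system_response
```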
11 | Lesson 2: Filtering With Metadata
12 | Lesson Link: https://learn.deeplearning.ai/courses/prompt-compression-and-query-optimization/lesson/gj6ye/filtering-with-metadata-
13 | In this lesson, you will develop a multi-stage MongoDB aggregation pipeline. You will discover how to use metadata to refine and limit the search results returned from database operations, enhancing efficiency and relevancy. All right, let's have some fun. Metadata is simply additional information about a specific piece of data; it's meant to supplement the key data point and provide more context. For example, imagine an image of the Mona Lisa. The image is the key data, which can be the vector embedding of the image itself. Other data, such as the title, artist name, location, and more, can be attached to the image embedding as metadata. Both the key data and the accompanying metadata can be stored in MongoDB as a single document. By pairing the vector embedding with metadata, the overall image data becomes more informative and useful. Let's understand how metadata can be useful within LLM applications and the RAG design pattern. Metadata can be used to add additional context to the embedded data for improved relevance and understanding. Metadata also improves the relevance of vector search queries by enabling filtering and sorting based on attributes associated with the embedded data; this can reduce the scope of the vector search operation or its results. To streamline the results of the vector search operation and improve relevance, you will use metadata within filtering stages composed together with a vector search stage in an aggregation pipeline. You will use MongoDB's aggregation pipeline to create composable queries, which makes it easier to reason about and implement complex queries. Creating an aggregation pipeline with filtering stages enables database operations to produce more relevant results. Let's see how. An example of one filtering technique is post-filtering. This is where a vector search stage is conducted and the results are then reduced based on certain criteria, referred to as filters. Imagine you have a user query that contains keywords such as seaside and restaurant, but also constraints such as specified quantities of rooms and capacity requirements. You can examine how a post-filtering process occurs by first starting with the full dataset, then applying a vector search operation on the full dataset to get results that are semantically similar to the user query. Then, in a post-filtering operation, you apply the filter stage after the vector search stage to further reduce the returned results based on the specified criteria. Now, let's see another filtering technique, known as pre-filtering. Using the pre-filtering technique within vector search can produce different results at the end of the database operation. Let's see how. Start with the same user query and the same dataset, but this time the filter operation or stage is applied to the dataset first, to remove records that don't meet the filter criteria. After this initial reduction, the vector search is applied to the filtered results. The key takeaway is that pre-filtering involves applying a filter to the dataset before conducting the vector search; this approach produces a reduced subset of data that the vector search will process for similarity measurement. One key difference between post-filtering and pre-filtering is that post-filtering might reduce the number of documents returned after semantic similarity with the user query vector has already been computed.
This means there could be a potential loss of information: records that are semantically similar to the user query might not be returned because they were removed by the filter stage. Now let's see what you're going to build in the coding section. You're going to set up a simple RAG pipeline, then add a post-filtering stage, observe the results, and handle the user query accordingly. You will then add a pre-filter stage to observe how the results differ between a post-filter and a pre-filter, and again handle the user query accordingly. Let's code. You'll start by importing custom_utils. This module has been created to streamline some of the code you used in lesson one. You'll notice where custom_utils methods are used, and an explanation will be provided. The first step you take is to load the data. For data loading, you're loading the same data you used in lesson one. To load your data and ensure it conforms to the model you saw in lesson one, you use the process_records function within the custom_utils module. This function takes in the dataset you loaded and conforms each data point to the model specified in lesson one. The result of this operation is a Python list of data points, where each data point is an Airbnb listing that conforms to the specified model. The next step is to make a connection to the database. Connecting to the database has been moved into the custom_utils module; calling the connect_to_database function executes the process of connecting to the MongoDB cluster and obtaining objects representing the database and the collection. You unpack the returned results, which gives you the database object and the collection object. This is the same step you did in lesson one; here, it's been streamlined into a simple function call. The next step is to ensure you're working with a clean collection. In lesson one, you ingested some data into your MongoDB collection. In this lesson, running delete_many on the collection object deletes the data ingested in lesson one. You will observe the number of documents deleted by the delete_many operation; here, 100 data points were deleted. The next step is to ingest the data. In this step, you ingest all the listing data created earlier into your MongoDB collection by calling the insert_many method on the collection object and passing in the listings as its argument; the ingestion process will then begin. On the screen, you will see a print statement indicating that the ingestion process has completed. The next step is to create a vector search index. You created a vector search index in lesson one. In lesson two, that extensive code has been streamlined into a function within the custom_utils module named setup_vector_search_index, which takes in the collection to create the index for. The result you see is expected: you've gotten a duplicate index error, because you already created this index in lesson one. Don't worry, you can carry on with the lesson. The next step is to compose the vector search query. This is a step you also took in lesson one, and it was one of the most important parts of the process. You'll just go over the code again; it's still the same as lesson one.
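For reference, the lesson-two setup steps described above might look roughly like this; the custom_utils function names are assumptions based on the narration:

```python
import custom_utils  # helper module provided with the course notebooks

# `dataset` is the HuggingFace dataset loaded earlier.
listings = custom_utils.process_records(dataset)

# Connect to the cluster and unpack the database and collection objects.
db, collection = custom_utils.connect_to_database()

# Start from a clean collection, then ingest the validated listings.
deleted = collection.delete_many({})
print(f"Deleted {deleted.deleted_count} documents")
collection.insert_many(listings)

# Create the vector search index (a duplicate-index error here just means
# the index from lesson one already exists).
custom_utils.setup_vector_search_index(collection=collection)
```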
Recall, the vector_search function takes in a few arguments: the user query, the database, and the collection. It converts the user query into an embedding, uses the embedding within a vector search operation, creates a pipeline with the vector search stage and any additional stages, and calls the aggregate method with the pipeline to get the results. You also print out the execution time of the vector search stage, and finally the result of the database operation is returned by the vector_search function. The next step is to handle user queries. The code for this lesson is similar to the previous lesson; the only differences are that you have different attributes and you are using custom_utils to get the address model. In a similar fashion to the previous lesson, handle_user_query is pretty much the same. It takes in the user query, the database object, and the collection object as arguments, along with some default arguments for the additional stages and the vector index name. You start the process by getting search results from the vector search operation. handle_user_query also conforms the results of the database operation to the SearchResultItem model specified in the previous cell. The search results are then passed to the LLM along with the query as additional context. Finally, you extract the system response and print it out in the notebook in a structured manner. handle_user_query returns the final system response. Now the fun begins, where you implement a post-filtering process that is conducted after the vector search operation. In this cell, you specify the path address.country. Essentially, the filtering you'll be conducting mimics a scenario where your app's user only wants to see listings in the United States. First, you specify the path of where the country is located within the document: it's in the address field, specifically the country field. You specify this path and assign it to the variable search_path. To create a match stage, you use the $match operator with the specified search_path, which takes in the string to match. You'll notice you're also adding another limitation, filtering documents based on the capacity a listing can accommodate. In a real-life scenario, there are situations where a listing only wants to take, or needs, a certain number of people. You can mimic this in your query by specifying conditional operators or conditional statements within your query. For this filter or match stage, you limit the documents returned to listings that accommodate more than one and fewer than five people, and this is the filtering condition you add after the vector search operation. Finally, you pass in the match stage and assign it to a variable called additional_stages. This cell contains the user query. It's the same user query as in the previous lesson, where the user wants to stay in a place that's warm and friendly and not too far from restaurants. The outputs will also be similar, where you see the user query, the system response, and the documents that were used as additional context in a table structure. You will observe that the documents returned are limited to the United States, because you are passing in the additional stage that contains the match filter.
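A minimal sketch of the post-filter ($match) stage described above; the exact match value "United States" and the accommodates bounds mirror the narration but should be treated as assumptions:

```python
# Post-filter applied after $vectorSearch: keep United States listings
# that accommodate more than one and fewer than five guests.
search_path = "address.country"

match_stage = {
    "$match": {
        search_path: "United States",
        "accommodates": {"$gt": 1, "$lt": 5},
    }
}

additional_stages = [match_stage]
```

These stages are appended to the pipeline after the $vectorSearch stage inside the vector_search function, which is what makes this post-filtering.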
This is the prompt you're passing to the system; you also pass in the additional stage, which is the match stage created in the previous cell. You will observe that the vector search operation took a fraction of a millisecond. The system recommended an easy one-bedroom in Chelsea. One thing I want you to do is pause this video and observe the locations of the documents used as additional context. You will notice they're all from the United States. You'll also observe that the accommodates value for each data point is between one and five. Here, you can see that the locations are all from the United States. Let's begin the second half of the fun, which is adding a pre-filter to the vector search operation. Here, you are creating the filter before the vector search operation is conducted. To conduct pre-filtering efficiently, in this cell you create a new vector search index. This is similar to the vector search index created in the previous lesson; the difference is that you are creating the index with the fields you are going to filter the results of the database operation on. These are specifically the accommodates field, which was used in the match stage previously, and the bedrooms field. It's important to create an efficient vector search index for your collection; this allows retrieval of information from all documents to be performant and not take too long. As previously done, you need to name your vector search index. You will call this specific index vector_index_with_filter, distinguishing it from the index created in the previous lesson. To create the index, you call the create_search_index function on the collection object, which returns the created index. Again, you will be defining a vector_search function, but this vector_search function is going to be different. Let me explain how. You have a similar vector_search function to the one implemented in a previous cell; the difference is that in the $vectorSearch stage you are adding a filter field. This filter field conducts the filter operation before the vector search operation is performed. The filter is similar to what you implemented in the match stage, where you specified a condition based on the accommodates field. In this filter, you are adding an additional condition to limit the documents considered for the vector search operation to ones that have seven or fewer bedrooms. With MongoDB, you can add conditional operators and let the database handle the logic: using the $and operator, you can pass in conditions that limit the documents returned from the database operation. This is essentially the pre-filtering step, after which the vector search process is conducted. The rest of the code remains the same: you create the pipeline, execute it to obtain the results, and view the execution time of the vector search query. The final step is to handle the user query, using the same query as before. The difference here is in handle_user_query and what you pass in as arguments. Observe that there is no additional stage passed into handle_user_query to be considered for the database operation. What you pass in instead is the new vector search index that was created with the filter fields, and the vector_search function now contains a filter component. As you can observe, the database operation, specifically the vector search operation, took a fraction of a millisecond.
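For reference, a sketch of the pre-filtering setup described above. The accommodates bounds are assumed to mirror the earlier match stage, and the stage shown is a fragment meant to replace the $vectorSearch stage inside the vector_search function:

```python
from pymongo.operations import SearchIndexModel

# New index that also indexes the fields used in the pre-filter.
filter_index_definition = {
    "mappings": {
        "dynamic": True,
        "fields": {
            "text_embeddings": {"dimensions": 1536, "similarity": "cosine", "type": "knnVector"},
            "accommodates": {"type": "number"},
            "bedrooms": {"type": "number"},
        },
    }
}
collection.create_search_index(
    model=SearchIndexModel(definition=filter_index_definition, name="vector_index_with_filter")
)

# $vectorSearch stage with a filter applied before similarity is computed.
vector_search_stage = {
    "$vectorSearch": {
        "index": "vector_index_with_filter",
        "queryVector": query_embedding,   # produced by get_embedding(user_query)
        "path": "text_embeddings",
        "numCandidates": 150,
        "limit": 20,
        "filter": {
            "$and": [
                {"accommodates": {"$gt": 1, "$lt": 5}},  # assumed to mirror the match stage
                {"bedrooms": {"$lte": 7}},
            ]
        },
    }
}
```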
And now the recommendation from the system is very different: it recommended a Sydney Hyde Park city apartment. This is different from the recommendation in the previous process. Pause the video here and observe the results returned. One thing you will notice is that the returned results meet the specified filters, which include the limits on accommodates and the limit imposed on bedrooms. Just to compare the two methods, post-filtering and pre-filtering: with the pre-filter, you will observe that the database operation returned 20 records. This is because you pre-filtered the documents, and the vector search operation returned the number of documents you specified, which was 20. You will notice a difference with post-filtering: you specified 20 documents in the vector search operation, but only four were returned. This is your post-filtering result, where you got just four documents from the database operation, because you conducted the vector search operation and then applied a match stage, which filtered the returned documents down to the smaller number matching your condition. All right. In this lesson, you implemented a RAG pipeline with vector search and a post-filtering step, then added a vector search operation with a pre-filtering step, and you observed the difference. In the next lesson, you're going to see how to reduce the amount of data returned from the database operation. See you there.
14 | Lesson 3: Projections
15 | Lesson Link: https://learn.deeplearning.ai/courses/prompt-compression-and-query-optimization/lesson/sge50/projections
16 | In this lesson, you'll learn how to streamline the outputs of database operations by incorporating a projection stage into the MongoDB aggregation pipeline. This will effectively reduce the amount of data returned, optimizing performance and data handling. All right, let's get on with it. In the previous lesson, you modified the fields within the documents returned after the vector search and other stage operations by using Pydantic and specifying the attributes you wanted in the Pydantic model. In that scenario, you're not using all of the fields in the documents returned by the aggregation pipeline, but you are leaving it to the application layer to handle the removal of unwanted attributes or fields. This has disadvantages, such as increased network traffic and processing time, as unwanted data must still be transmitted and then filtered out at the application layer. With a MongoDB database, the inclusion or exclusion of specific fields can be handled as another stage added to the aggregation pipeline. This is done through a technique known as projection, which outputs the same number of documents as the stage before it, but reduces the fields returned in each document. The projection technique in MongoDB works by specifying fields to include or exclude from the final documents. For example, the document representation of the Mona Lisa painting used in a previous lesson can be reduced to a select few fields using the $project operator in MongoDB, which you will get to implement in the code section soon. There are several advantages to projection. With the inclusion of projection, the overall memory usage at the application layer is reduced, as less data is passed back as results from database operations. This can also contribute to reduced query execution time. And there is the case of security and privacy: take, for example, a finance application where personal information and sensitive data are stored in documents. It can be useful to have the database handle the logic of removing sensitive information before it is sent to downstream processes. This provides an overall improved sense of security in the application. In the coding section, you will go through familiar steps to implement a RAG system and add a filter stage, but then add an additional projection stage, and then proceed to handle the user query. Let's code. You will start by importing custom_utils as you did in the previous lesson, then move on to downloading the dataset from HuggingFace, as you also did in the previous lesson. You also load the listings dataset by conforming it to the Pydantic model defined in custom_utils, just like you did in the previous lesson, then connect to your database, delete the records in the collection, and observe the number of documents deleted. Just like the previous lesson, you insert a new batch of records. Here, you're using a vector search index with a filter, similar to the one you created in the last lesson. This code has been moved into the custom_utils module, and you load it by calling the setup_vector_search_index_with_filter function and passing in the collection object. This will create a vector search index that is optimized to retrieve data filtered on the accommodates and bedrooms attributes. You start by defining a search result model similar to what you've done in the previous lesson, this time with some new attributes such as score and notes. In the next cell, you implement the handle_user_query function.
This is similar to the function you've created in previous lessons. The main difference is that this function prints out the list of fields in the first document. You do this by accessing the first element returned from get_knowledge and iterating through its keys. You're doing this to observe the fields of the documents that are allowed through the projection stage, before they are further limited and passed into the SearchResultItem model. Now, you're going to implement a projection stage and add it to the additional stages that will be passed into the vector search query function. You define a variable called projection_stage and assign it a projection document. A projection stage is executed as part of a database operation and is indicated by the dollar operator and the word project; the $project command takes in a document that represents the fields to be projected. One thing to note: every document returned by the aggregation pipeline will include an _id field, which is returned automatically. You can exclude it by setting the value for that field to zero; this is an exclusion. To include a field in a projection, you mention the name of the field, such as accommodates, and assign it the value one; this is an inclusion pattern. As you can observe, we follow the same pattern for all the fields we want to project. By including fields to project, you are automatically excluding fields that are not mentioned. That is all you need to do for the projection stage, but also notice that you are adding a score field and assigning the value of the vector search score to that field. This is a way to get the similarity score of the vector search operation into the documents returned from the database operation. Now, place the projection stage into a Python list and assign it to the additional_stages variable. One more thing: these are all the fields and attributes we want in our Pydantic model, and they should be included in any of your projection documents. This cell will look familiar, as you've used it in a previous lesson. Here, you have the user query to look for places that are warm and friendly. And here are the main changes: in handle_user_query, you pass in the additional stages, the list which includes the projection stage. handle_user_query will also print the fields of the first document before the documents are processed by the Pydantic model. Also, you're using the vector index with filter for this vector search operation. After running this cell, you'll observe that the database vector search operation was executed in a fraction of a millisecond. You'll also observe that the fields included in the document are the ones specified in the projection: we have the name, summary, space, and the other fields we wanted projected and included in the documents. The results are still the same; we're still getting the same documents. I want to show you one thing: when conducting a projection, it's important that you maintain the pattern throughout for every field indicated. Meaning, if you're conducting an inclusion, use the one pattern throughout; if you're conducting an exclusion, use the zero pattern throughout. The only exception to this rule is the _id field. Let me show you an example. You can change this value to zero to represent an exclusion. What will happen is a database operation failure. As you can see, you got an operation failure, but scrolling further down, you will see the reason for it.
The reason for this failure is an invalid projection document: you can't exclude a field in an inclusion projection. To fix the operation failure, simply set the value back to one to follow the inclusion pattern. Now, your results are back. One thing to note is that because we included a score field holding the vector similarity search score, we can see it in the results as well. Let's have a look. Here, you can see the score field and the associated vector search similarity score. Vector search similarity scores range between 0 and 1, with 1 being a very close similarity. You can pause the video here to observe the scores of the documents returned from the database operation. That concludes this lesson. In this lesson, you went through the typical pattern of setting up a RAG pipeline with vector search indexes, and you also ingested data into a database collection. The new thing you learned in this lesson is how to create and add a projection stage to the aggregation pipeline, to limit the fields returned from the aggregation pipeline query. In the next lesson, you will see how you can add boosting and improve the relevance of the vector search operation by looking at qualitative and quantitative data and using it to affect the ranking of documents returned from the aggregation pipeline. See you in the next lesson.
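Putting the pieces of this lesson together, the inclusion-style projection stage described above might look roughly like this; the exact field list is an assumption based on the narration:

```python
# $project stage: keep only the fields the SearchResultItem model needs,
# drop _id, and surface the vector search similarity score.
projection_stage = {
    "$project": {
        "_id": 0,              # excluding _id is the one allowed exception in an inclusion projection
        "name": 1,
        "accommodates": 1,
        "address": 1,
        "summary": 1,
        "space": 1,
        "notes": 1,
        "score": {"$meta": "vectorSearchScore"},  # similarity score from $vectorSearch
    }
}

additional_stages = [projection_stage]
```

Mixing 1s and 0s for regular fields in this document would reproduce the invalid-projection failure discussed above.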
17 | Lesson 4: Boosting
18 | Lesson Link: https://learn.deeplearning.ai/courses/prompt-compression-and-query-optimization/lesson/b3sle/boosting
19 | In this lesson, you dive into techniques for reordering documents to improve information retrieval relevance and quality. You'll learn how to use specific metadata values to determine reordering position. Let's go. There are scenarios where a document contains other fields that should affect its position within search results. Take, for example, an Airbnb listing with rating and number-of-reviews fields. These fields are qualitative and quantitative measures that can contribute to the relevance of a document with respect to a user query and search criteria. Taking the values of these fields into consideration in order to affect the position of a document in the list of returned search results is referred to as boosting. Why should you consider adding a boosting technique to your search queries? Vector search is an effective method of ranking documents based on semantic similarity, but although vector search scores and ranking are effective, metadata values can also contribute to document relevance, which can affect the ordering within search results. Using additional qualitative and quantitative measures to rank documents ensures database operation results are credible and relevant to user queries and their search criteria. Boosting can also be used to make sure results meet user-specific requirements, which introduces personalization within search results. In the coding section, you're going to go through some familiar steps. The first is to set up a RAG pipeline and add the relevant stages. Then, you add the boosting logic, which uses some of the mathematical operators available within the MongoDB database. And as usual, you handle the user query and visualize the results. Let's code. Start by importing your custom_utils module, as you've done in previous lessons. Move on to loading the data, also as you've done in previous lessons; you can take some time to view the attributes of each data point. Move on to the document modeling, which loads the listings into a conformed model; this is similar to the process you've carried out in previous lessons. The next step is to get objects for your database and your collection, then start with a clean collection by calling delete_many on the collection object. This is similar to the process carried out in previous lessons. Go through the data ingestion process and move on to the vector search index definition process, all similar to the previous lessons. Now, you define a search result item model for the results shown in this lesson. The attributes for each result need to contain a combined score, the number of reviews, and the average review score; these new attributes will be explained later. Again, just like in the previous lesson, you have the handle_user_query function with the exact same code. Now, we can get to the main aspect of this lesson: you'll implement the boosting logic and add it to the aggregation pipeline alongside the vector search operation. Here, you assign to a variable named review_average_stage a stage that adds two new fields to every document returned from the database operation. The first field is the average review score; this is the qualitative measure I was talking about. The average review score goes through the review components of a document and takes the average of their sum. Within every document, we can see the accuracy, cleanliness, check-in, and other review attributes of a listing.
You get each score and, with the dollar operator, specifically $add, which conducts the mathematical operation of addition, you get the sum of all the review components; you then divide it by the number of review components, which in this case, for the listings in our dataset, is six. This gives you an idea of the average rating of a listing, and that explains the new field added to every document, the average review score. The second field added to every document is the review count boost. This is the quantitative measure, and this field takes the value of the number-of-reviews attribute in each document. This is how you can copy the value of one field into a new field: simply use the dollar operator and the name of the field. To add these new fields to every document in the database operation, you add this process as a new stage, specifically an $addFields stage. That concludes adding the qualitative measure and the quantitative measure. In the next step, you add weights and determine how the qualitative measure and the quantitative measure should each affect the ranking of a document after the vector search operation. This is done by adding a new stage to the pipeline: the weighting stage. The weighting stage comes right after the review average stage, so it has a reference to the average review score and the review count boost that were added to each document in the review average stage; this is how you can reference the values of these fields from the documents. To implement the weighting logic, you use several operators provided by the MongoDB database to conduct mathematical operations: the $add operator and the $multiply operator. With the $multiply operator, you multiply the value of the average review score, the qualitative measure, by a weight; I'm using a number between 0 and 1 as the weight. Then you do the same for the review count boost, the quantitative measure that will be considered to rank the document after the vector search operation. You then use the $add operator to combine the two results from the multiplication operations and assign this new value to the combined score field. The combined score is the combination of the two weighted values, and you add it to each document within the database operation by using the $addFields operator. This is the weighting stage. There is one more stage to complete this process: the sorting stage. The sorting stage is very simple. Using the $sort operator, you rerank the documents based on their combined score, or any field you choose. In this case, you are using the combined score and sorting in descending order, indicated by -1; ascending order would be indicated by 1. Now that you have all the additional stages implemented to add to the vector search operation, you create a new variable called additional_stages that takes a list of all the defined stages. The first is the review average stage, where you conduct the mathematical operations to obtain the qualitative and quantitative measures and add them as new fields to the documents after the vector search operation. Then there is the weighting stage, and then the sorting stage. All the stages are executed sequentially after the vector search operation. Remember, the vector search operation you're using in this lesson is the pre-filtered vector search.
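A rough sketch of the three boosting stages described above. The field names for the six review components, the new field names, and the 0.9/0.1 weights are assumptions for illustration:

```python
# Stage 1: add the qualitative and quantitative measures to every document.
review_average_stage = {
    "$addFields": {
        "averageReviewScore": {
            "$divide": [
                {"$add": [
                    "$review_scores.review_scores_accuracy",
                    "$review_scores.review_scores_cleanliness",
                    "$review_scores.review_scores_checkin",
                    "$review_scores.review_scores_communication",
                    "$review_scores.review_scores_location",
                    "$review_scores.review_scores_value",
                ]},
                6,  # number of review components
            ]
        },
        "reviewCountBoost": "$number_of_reviews",  # copy one field's value into a new field
    }
}

# Stage 2: weight the two measures and combine them into a single score.
weighting_stage = {
    "$addFields": {
        "combinedScore": {
            "$add": [
                {"$multiply": [0.9, "$averageReviewScore"]},  # weight for the qualitative measure
                {"$multiply": [0.1, "$reviewCountBoost"]},    # weight for the quantitative measure
            ]
        }
    }
}

# Stage 3: rerank by the combined score in descending order.
sorting_stage = {"$sort": {"combinedScore": -1}}

additional_stages = [review_average_stage, weighting_stage, sorting_stage]
```

Swapping the two weights reproduces the experiment discussed later, where review count dominates rating.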
The pre-filtered vector search is similar to the one you created in previous lessons. Now it's time for you to see the results of the boosting logic. Using the same query and the same handle_user_query function from previous lessons, you pass in the additional stages and make sure to use the vector index with filter. Here, you can observe that the vector search stage was conducted in a fraction of a millisecond. Now, let's observe the documents returned from this operation, which included a combination of stages to implement the boosting logic. Here you can see the results of the database operation that included multiple stages: the average review score is included, along with the number of reviews and the combined score. Remember, the combined score takes the weighting into consideration. The documents shown are ordered by the combined score. You can pause the video here and observe the combined scores of the other documents. One thing to note is that, because of the weighting logic we added, you will observe that despite this document having a high rating, it is ranked lower than the documents above it because it has a lower number of reviews. This is the impact of adding weights to the components you're considering in your boosting logic. One more thing: you can play with the weights and adjust the numbers to see how they affect the results. To do this, simply go back to the weighting stage and adjust the weights. Now I'm giving a higher weight to the review count and a lower one to the average review score. Once you've changed the weights, you can observe the results again. As you can see from the results, because you've given more weight to the number of reviews, a document with a high number of reviews is ranked higher than one with a higher rating. Pause the video here to observe the results. In this lesson, you've learned how to implement a typical RAG system and conduct vector search, but now you've added multiple stages to the aggregation pipeline to implement a boosting logic, which adds more relevance and context to the ranking of your documents after a database operation. In the next lesson, you'll learn how to utilize prompt compression to reduce the prompts that are sent to large language models in order to reduce operational costs. See you then!
20 | Lesson 5: Prompt Compression
21 | Lesson Link: https://learn.deeplearning.ai/courses/prompt-compression-and-query-optimization/lesson/ujs5z/prompt-compression
22 | In this lesson, you implement the cost-saving strategy of prompt compression, particularly valuable for applications like RAG and agentic systems. You gain an intuition of what prompt compression is, how to use it, and the operational advantages it brings to an LLM application. Let's get on with it. Many prompting strategies have emerged over recent years, such as in-context learning, chain-of-thought, and ReAct prompting. Getting appropriate, quality responses from an LLM is an art form, and most of these prompting strategies involve composing extensive text as input to the LLM. LLMs with large context windows are becoming the new norm: it is now common to see LLMs that can take an input size of over 100,000 tokens, and in some cases even a million tokens. That is like passing an entire novel into an LLM in one inference call. Although useful in some cases, utilizing the full context window when accessing LLMs provided through REST API calls can become very expensive. LLMs with large context windows have their place in real-world applications, but the operational costs of these models can skyrocket. Take, for example, paying $10 per 1 million tokens in an application such as Airbnb, which has several million users per day: you would have a huge operational expense from the volume of interactions alone. Not to mention that there will be increased response latency, as the model has to process more input to extract the appropriate information to respond to user queries. As you continue to learn and build AI applications that use LLMs, you will come across the idea of prompt compression, sometimes referred to as token compression. You might think you won't have that much volume at the initial development stage of your AI application, but building robust AI applications requires thinking ahead about scalability and solving for issues that might become bottlenecks. You will implement a prompt compression technique in the code section of this lesson and observe firsthand how easy it is to implement prompt compression alongside existing RAG pipelines. So, prompt compression is the process of reducing the number of tokens in a prompt. Let's see what this looks like in an example. On the screen, you can observe an original, uncompressed prompt that spans three long sentences. By using the LLMLingua package, which you will use in the coding section of this lesson, we are able to reduce the uncompressed text to two sentences that span just a few rows. This is the power of prompt compression, which you will see firsthand in the coding section. I have included a link to the paper presenting the prompt compression technique on the slides; feel free to read the research paper after this lesson. In a few minutes, you will compress an extensive prompt of a few thousand tokens down to a few hundred tokens. Passing input into the prompt compression technique is very straightforward: imagine having an uncompressed prompt of 50,000 tokens; just by passing the uncompressed prompt and specifying a few parameters using the prompt compression library LLMLingua, you can reduce the uncompressed prompt down to 10,000 tokens. This is a five times reduction, and you can then pass the input straight into the LLM as you would with the uncompressed prompt and receive output of the same quality as if the model had been given the uncompressed prompt. You are about to see this in code.
In the coding section of this lesson, you will go through some familiar steps, which include setting up the RAG pipeline, adding the relevant MongoDB stages, and then implementing the compression logic. And as usual, you will handle the user query and observe the results. Let's code. Start by importing the custom_utils module, as you've done in previous lessons. Move on to loading your dataset, where you can observe the attributes, similar to previous lessons. The next steps are covered in previous lessons: you model the documents, connect to your database, extract objects for your database and your collection, delete existing records within the collection, ingest new data, and lastly create your vector search index. Just like in previous lessons, you handle the user query. You start by creating the SearchResultItem model, which specifies the attributes you want from the documents returned by the database operation; in this case, you have the name, address, and other corresponding attributes. Just like in the previous lesson, you add the additional boosting stages: the review average stage, the weighting stage, and the sorting stage. Then, you add all the additional stages into a variable called additional_stages. These are all steps you took in previous lessons. Now, we're at the main part of this lesson, where you have a handle_user_query similar to the one you've seen in the previous lesson, but you are printing out the uncompressed prompt for observation; this is specified in two new print statements. Now, you have the same user query used in previous lessons and the same handle_user_query function, but with the difference in this lesson of the print statements, where you can see the uncompressed prompt. From the output, you can observe the time it took for the vector search operation to execute, which is a fraction of a millisecond. You can also observe the uncompressed prompt. Here, you can see that the uncompressed prompt is extensive, and do note that it's been truncated to ensure it fits on the screen. You can also view the full content of the prompt by looking at the documents returned from the vector search operation, listed in the table. Pause the video here to take in the sheer size of what is being passed into the LLM. You will also notice that the system has recommended the homely room in a five-star new condo; remember this listing. Now, this is the fun part. We're going to look at a technique that allows us to take the extensive prompt you observed earlier and reduce it down to a few hundred tokens. You start by importing the PromptCompressor constructor from the llmlingua library. Using the PromptCompressor constructor, you can specify a smaller language model that has been fine-tuned for prompt compression to do the compression of an uncompressed prompt. You also specify the use of the latest LLMLingua-2 prompt compression logic by setting the use_llmlingua2 argument to True, and you specify the CPU as the device for the prompt compression module, to ensure you're using the CPU on the device. Now that you've set up the prompt compressor, specifically a smaller language model to do the prompt compression, you can move on to define the compression function. You define the compressed_query_prompt function, which takes in the uncompressed prompt as the query for the function. The prompt compressor module requires the input to be structured in a certain way.
That structure is component-based, with the fields demonstration, instruction, and question. I'll go over what these mean. Demonstration holds the context used as additional information that is passed to the LLM with the user query; this is essentially the documents returned from the database operation. Instruction holds a specific instruction that tells the smaller language model how to compress the prompt. Finally, question is the user query itself. Now, you can call the compress_prompt method on the LLMLingua model you initialized earlier. I'm going to explain what each argument does. The first argument specifies how to split the context up, specifically on the newline character. The second argument takes in the instruction, then the question. Next, you specify the target token count you want the uncompressed prompt to be compressed down to. Next, there is the specification of the compression algorithm to use; you'll be using the latest compression algorithm from LLMLingua, specifically LongLLMLingua. Next, you specify the context budget, allowing the budget to overrun by 100 tokens. You also specify the compression ratio; this ratio indicates how the compression logic should allocate tokens between the context, which is the demonstration, and the overall instruction and question. Finally, you enable the compressor to reorder the context using a sort algorithm. Those are all the arguments for the compress_prompt method. The result of the compressed_query_prompt function is a JSON-like representation of the compressed prompt that includes information such as the original token count of the uncompressed prompt and the compressed prompt's token count. You will see this in action in a second. Now that you've specified a method for compressing an uncompressed prompt, you define handle_user_query_with_compression, which takes in the user query and conducts the compression defined earlier. This function is similar to the handle_user_query function from previous lessons, but the key difference is that it builds a new input for the LLM specified as query_info. This query_info follows the structure of the compression logic, which has the demonstration, instruction, and question. Remember, the demonstration is just the results from the database operation, the instruction tells the compression module how you want the compression to be executed, and the question is simply the user query. This structure is passed into compressed_query_prompt by calling the function defined earlier, passing in the query_info, and assigning the result to a variable called compressed_prompt. To visualize the result, you print out the compressed prompt in a structured manner. Finally, handle_user_query_with_compression returns the search results and the compressed prompt itself. The final function you implement in this lesson is handle_system_response. handle_system_response passes the query along with the compressed prompt as input to the LLM. For visualization, you print out the query and the system response. Now, you can use the handle_user_query_with_compression function defined earlier by passing in the query, the database object, the collection object, the additional stages, and the vector search index you're using for this lesson. Execute the cell.
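For reference, the compressor setup and compression call described above might look roughly like this. The model name, target token count, and compression ratio are assumptions consistent with the narration, not the course's exact values:

```python
from llmlingua import PromptCompressor

# Smaller model fine-tuned for prompt compression, running on CPU with LLMLingua-2 logic.
llm_lingua = PromptCompressor(
    model_name="microsoft/llmlingua-2-xlm-roberta-large-meetingbank",
    use_llmlingua2=True,
    device_map="cpu",
)

def compressed_query_prompt(query_info: dict) -> dict:
    demonstration = query_info["demonstration_str"]  # context from the database operation
    instruction = query_info["instruction"]
    question = query_info["question"]

    compressed = llm_lingua.compress_prompt(
        demonstration.split("\n"),              # split the context on newlines
        instruction=instruction,
        question=question,
        target_token=500,                       # assumed compression target
        rank_method="longllmlingua",            # latest LLMLingua ranking algorithm
        context_budget="+100",                  # allow the context budget to overrun by 100 tokens
        dynamic_context_compression_ratio=0.4,  # token allocation between context and instruction/question
        reorder_context="sort",                 # reorder the context using a sort algorithm
    )
    # Returns a dict containing the compressed prompt plus original and compressed token counts.
    return compressed
```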
The execution of this cell might take a few minutes, as you are using a smaller language model to compress the prompt. Although there is increased latency here, the overall operational cost will be reduced. Here is the result of the prompt compression technique. We have a compressed prompt field that holds the full compressed prompt, and as you can see, it is much shorter than the prompt we saw earlier. More importantly, the original, uncompressed prompt was 4284 tokens long, and the new compressed prompt is just 512 tokens; this is a compression ratio of roughly eight times. There is also an indication of the cost saving you get when using this prompt compression technique and passing the compressed prompt as input to a GPT-4 model; in this case, for this particular call, we're saving $0.2. If you think about this for a large-scale application such as Airbnb, where several million inference calls are made to APIs, the savings could be in the hundreds of thousands. The last step of this lesson is to pass the compressed prompt and the user query to the large language model to actually get a system response. Confirm you have a compressed prompt, then pass the compressed prompt and the query into the handle_system_response function. Here we can observe the result. The compressed prompt has given us a recommendation that meets the user query closely: specifically, it's in a warm, friendly neighborhood, which was included in the space description of the listing, and it's also next to restaurants, which is what was specified in the user query. We saved on the operational costs of the RAG pipeline and obtained a quality output. This output is not the same as the uncompressed prompt's output, but it's of similar quality and meets the requirements specified in the user query. The difference between the result of the uncompressed prompt, which provided us with one recommendation, and the compressed prompt, which provided us with another, is quite minimal. This signifies that with a lower token count you can get outputs of similar quality from a large language model. This concludes this lesson. In this lesson, you learned how to create a RAG pipeline based on vector search and also conducted prompt compression to get a quality output, as you would with an uncompressed prompt. Here are some additional resources I recommend you take a look at after completing this lesson. The first resource is the MongoDB Developer Center, which contains tutorials, articles, and videos covering a variety of topics related to AI. Next, there is the GenAI Showcase repo, a repository showcasing different code and use cases for RAG and agentic systems. And finally, the DeepLearning.AI forum, where you can ask questions regarding this course.
23 | Lesson 6: Conclusion
24 | Lesson Link: https://learn.deeplearning.ai/courses/prompt-compression-and-query-optimization/lesson/bgsip/conclusion
25 | Congratulations on completing this course. In this course, you implemented vector search. You optimized RAG systems by using metadata and the MongoDB aggregation pipeline to improve system efficiency and output relevance. And finally, you learned how to reduce LLM application operational costs by using prompt compression. I'm looking forward to seeing what you build on your own.
26 |
--------------------------------------------------------------------------------