├── README.md └── DSPyRAG.py /README.md: -------------------------------------------------------------------------------- 1 | DSPyRAG README 2 | 3 | This repository contains code implementing a RAG (Retrieval-Augmented Generation) system using DSPy, a framework for enhancing language model (LM) performance. Below are the key components and functionalities of this code: 4 | 5 | Import Modules: The code imports necessary modules and sets up environmental variables for the project. 6 | 7 | Configure Models: It configures the language model (GPT-3.5 Turbo) and the retriever model (ColBERTv2) using DSPy. 8 | 9 | Parse File and Create Index: Utilizing LlamaParse, the code parses a PDF file and creates a search index using VectorStoreIndex. 10 | 11 | Define Signature and Modules: It defines a signature for generating answers and creates a RAG module to handle question answering tasks. 12 | 13 | Define Validation Logic and Create Training Set: Validation logic is defined to evaluate predicted answers. Additionally, a training set with example questions and answers is created. 14 | 15 | Set up Teleprompter: A BootstrapFewShot teleprompter is set up to compile the RAG module with the training set. 16 | 17 | Usage: The compiled RAG module is used to answer questions about the PDF file. 18 | 19 | For more details on usage and functionality, refer to the code comments and outputs provided in the repository. 
20 | -------------------------------------------------------------------------------- /DSPyRAG.py: -------------------------------------------------------------------------------- 1 | # Import modules 2 | import os 3 | import sys 4 | import dspy 5 | import pkg_resources 6 | from dspy import Signature, InputField, OutputField, Module, Predict, Prediction 7 | from llama_parse import LlamaParse 8 | from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage 9 | from dspy.teleprompt import BootstrapFewShot 10 | from dspy.evaluate import answer_exact_match, answer_passage_match 11 | from dspy import Example 12 | 13 | # Set environmental variables 14 | os.environ["OPENAI_API_KEY"] = "sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx7M" 15 | openai_api_key = os.environ["OPENAI_API_KEY"] 16 | 17 | # Define path to project 18 | repo_path = 'C:\\Users\\user\\Documents\\Jan 2024\\Projects\\RAGs\\New\\DSPy\\DSPyRAG' 19 | 20 | # Add the project path to your system path 21 | if repo_path not in sys.path: 22 | sys.path.append(repo_path) 23 | 24 | # Set up the cache for this script 25 | os.environ["DSP_NOTEBOOK_CACHEDIR"] = os.path.join(repo_path, 'cache') 26 | 27 | # Check if dspy-ai is installed 28 | if not "dspy-ai" in {pkg.key for pkg in pkg_resources.working_set}: 29 | print("Please install dspy-ai and openai using pip") 30 | 31 | # Configure LM 32 | turbo = dspy.OpenAI(model='gpt-3.5-turbo') 33 | dspy.settings.configure(lm=turbo) 34 | 35 | # Parse file 36 | parser = LlamaParse( 37 | api_key="llx-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxGh7", 38 | result_type="text", 39 | language="en", 40 | varbose=True 41 | ) 42 | 43 | # Create documents and index 44 | documents = parser.load_data("C:\\Users\\user\\Documents\\Jan 2024\\Projects\\RAGs\\Files\\PhilDataset.pdf") 45 | print("Documents created") 46 | index = VectorStoreIndex.from_documents(documents) 47 | 48 | index.set_index_id("vector_index") 49 | index.storage_context.persist("./storage") 50 | 51 | 
storage_context = StorageContext.from_defaults(persist_dir="storage") 52 | 53 | # Create query engine as index 54 | index = load_index_from_storage(storage_context, index_id="vector_index") 55 | query_engine = index.as_query_engine(response_mode="tree_summarize") 56 | 57 | # Create signature 58 | class GenerateAnswer(dspy.Signature): 59 | """Answer questions with short factoid answers.""" 60 | context = dspy.InputField(desc="may contain relevant facts") 61 | question = dspy.InputField() 62 | answer = dspy.OutputField(desc="Often between 5 and 10 words") 63 | print("Class 1 created") 64 | 65 | # Define modules 66 | class RAG(dspy.Module): 67 | def __init__(self, num_passages=3): 68 | super().__init__() 69 | self.query_engine = query_engine 70 | self.generate_answer = Predict(GenerateAnswer) 71 | print("Class 2 created") 72 | 73 | def forward(self, question): 74 | response = self.query_engine.query(question) 75 | context = response.response 76 | prediction = self.generate_answer(context=context, question=question) 77 | return dspy.Prediction(context=context, answer=prediction.answer) 78 | custom_rag = RAG(query_engine) 79 | 80 | question = "What did Phil wanted to become when he grew up?" 
81 | pred = custom_rag(question) 82 | print(f"Question: {question}") 83 | print(f"Predicted Answer: {pred.answer}") 84 | 85 | # Create validation logic 86 | def validate_context_and_answer(example, pred, trace=None): 87 | answer_EM = answer_exact_match(example, pred) 88 | answer_PM = answer_passage_match(example, pred) 89 | return answer_EM and answer_PM 90 | 91 | # Define examples with the necessary fields 92 | train_example1 = Example(question="What did young Philemon wanted to become when he grew up?", answer="Engineer") 93 | train_example2 = Example(question="What did Philemon realize his curiosity was pushing him towards as he grew older?", answer="Sciences") 94 | train_example3 = Example(question="How many years after graduation did Philemon spent working in the academic writing industry?", answer="Eight") 95 | train_example4 = Example(question="Which is one of the subjects that Philemon handled in academic writing assignments?", answer="Nursing") 96 | train_example5 = Example(question="What made the global academic system to go into hibernation?", answer="Covid") 97 | train_example6 = Example(question="Which year did the usual peak season failed to materialize?", answer="2021") 98 | train_example7 = Example(question="When was the ranking systems introduced to deny all other writers the chance to see available orders?", answer="2023") 99 | train_example8 = Example(question="In 2024, how many orders had Philemon completed until February 15?", answer="4") 100 | train_example9 = Example(question="What was the main reason Philemon wanted to branch into other high-demand fields?", answer="Income") 101 | train_example10 = Example(question="What did Philemon eventually venture into in his undergraduate studies?", answer="Chemistry") 102 | 103 | # Tell DSPy that the 'question' field is the input 104 | trainset = [ 105 | train_example1.with_inputs('question'), 106 | train_example2.with_inputs('question'), 107 | train_example3.with_inputs('question'), 108 | 
train_example4.with_inputs('question'), 109 | train_example5.with_inputs('question'), 110 | train_example6.with_inputs('question'), 111 | train_example7.with_inputs('question'), 112 | train_example8.with_inputs('question'), 113 | train_example9.with_inputs('question'), 114 | train_example10.with_inputs('question'), 115 | ] 116 | 117 | print("Trainset created") 118 | 119 | # Set up teleprompter 120 | teleprompter = BootstrapFewShot(metric=validate_context_and_answer) 121 | 122 | compiled_rag = teleprompter.compile(custom_rag, trainset=trainset) 123 | 124 | # Use compiled_rag to answer questions about your PDF! 125 | question = "When did the rationing of orders took a policy direction?" 126 | pred = compiled_rag(question) 127 | print(f"Question: {question}") 128 | print(f"Predicted Answer: {pred.answer}") 129 | print("Retrieved Contexts:") 130 | for context in pred.context: 131 | full_context = ''.join(context) 132 | print(full_context) 133 | 134 | 135 | #Output 136 | #Started parsing the file under job_id 65bd7202-7285-44d3-8f02-7a1a115a4367 137 | #Documents created 138 | 139 | #Question: What did Phil wanted to become when he grew up? 140 | #Predicted Answer: An engineer 141 | 142 | #100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:00<00:00, 18.05s/it] 143 | #Bootstrapped 1 full traces after 10 examples in round 0. 144 | 145 | #Question: When did the rationing of orders took a policy direction? 146 | #Predicted Answer: 2023 147 | #Retrieved Contexts: 148 | #The rationing of orders took a policy direction in 2023. --------------------------------------------------------------------------------