├── README.md └── DSPyRAG.py /README.md: -------------------------------------------------------------------------------- 1 | DSPyRAG README 2 | 3 | This repository contains code implementing a RAG (Retrieval-Augmented Generation) system using DSPy, a framework for enhancing language model (LM) performance. Below are the key components and functionalities of this code: 4 | 5 | Import Modules: The code imports necessary modules and sets up environmental variables for the project. 6 | 7 | Configure Models: It configures the language model (GPT-3.5 Turbo) and the retriever model (ColBERTv2) using DSPy. 8 | 9 | Parse File and Create Index: Utilizing LlamaParse, the code parses a PDF file and creates a search index using VectorStoreIndex. 10 | 11 | Define Signature and Modules: It defines a signature for generating answers and creates a RAG module to handle question answering tasks. 12 | 13 | Define Validation Logic and Create Training Set: Validation logic is defined to evaluate predicted answers. Additionally, a training set with example questions and answers is created. 14 | 15 | Set up Teleprompter: A BootstrapFewShot teleprompter is set up to compile the RAG module with the training set. 16 | 17 | Usage: The compiled RAG module is used to answer questions about the PDF file. 18 | 19 | For more details on usage and functionality, refer to the code comments and outputs provided in the repository. 
20 | -------------------------------------------------------------------------------- /DSPyRAG.py: -------------------------------------------------------------------------------- 1 | # Import modules 2 | import os 3 | import sys 4 | import dspy 5 | import pkg_resources 6 | from dspy import Signature, InputField, OutputField, Module, Predict, Prediction 7 | from llama_parse import LlamaParse 8 | from llama_index.core import VectorStoreIndex, StorageContext, load_index_from_storage 9 | from dspy.teleprompt import BootstrapFewShot 10 | from dspy.evaluate import answer_exact_match, answer_passage_match 11 | from dspy import Example 12 | 13 | # Set environmental variables 14 | os.environ["OPENAI_API_KEY"] = "sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx7M" 15 | openai_api_key = os.environ["OPENAI_API_KEY"] 16 | 17 | # Define path to project 18 | repo_path = 'C:\\Users\\user\\Documents\\Jan 2024\\Projects\\RAGs\\New\\DSPy\\DSPyRAG' 19 | 20 | # Add the project path to your system path 21 | if repo_path not in sys.path: 22 | sys.path.append(repo_path) 23 | 24 | # Set up the cache for this script 25 | os.environ["DSP_NOTEBOOK_CACHEDIR"] = os.path.join(repo_path, 'cache') 26 | 27 | # Check if dspy-ai is installed 28 | if not "dspy-ai" in {pkg.key for pkg in pkg_resources.working_set}: 29 | print("Please install dspy-ai and openai using pip") 30 | 31 | # Configure LM 32 | turbo = dspy.OpenAI(model='gpt-3.5-turbo') 33 | dspy.settings.configure(lm=turbo) 34 | 35 | # Parse file 36 | parser = LlamaParse( 37 | api_key="llx-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxGh7", 38 | result_type="text", 39 | language="en", 40 | varbose=True 41 | ) 42 | 43 | # Create documents and index 44 | documents = parser.load_data("C:\\Users\\user\\Documents\\Jan 2024\\Projects\\RAGs\\Files\\PhilDataset.pdf") 45 | print("Documents created") 46 | index = VectorStoreIndex.from_documents(documents) 47 | 48 | index.set_index_id("vector_index") 49 | index.storage_context.persist("./storage") 50 | 51 | 
storage_context = StorageContext.from_defaults(persist_dir="storage") 52 | 53 | # Create query engine as index 54 | index = load_index_from_storage(storage_context, index_id="vector_index") 55 | query_engine = index.as_query_engine(response_mode="tree_summarize") 56 | 57 | # Create signature 58 | class GenerateAnswer(dspy.Signature): 59 | """Answer questions with short factoid answers.""" 60 | context = dspy.InputField(desc="may contain relevant facts") 61 | question = dspy.InputField() 62 | answer = dspy.OutputField(desc="Often between 5 and 10 words") 63 | print("Class 1 created") 64 | 65 | # Define modules 66 | class RAG(dspy.Module): 67 | def __init__(self, num_passages=3): 68 | super().__init__() 69 | self.query_engine = query_engine 70 | self.generate_answer = Predict(GenerateAnswer) 71 | print("Class 2 created") 72 | 73 | def forward(self, question): 74 | response = self.query_engine.query(question) 75 | context = response.response 76 | prediction = self.generate_answer(context=context, question=question) 77 | return dspy.Prediction(context=context, answer=prediction.answer) 78 | custom_rag = RAG(query_engine) 79 | 80 | question = "What did Phil wanted to become when he grew up?" 
81 | pred = custom_rag(question) 82 | print(f"Question: {question}") 83 | print(f"Predicted Answer: {pred.answer}") 84 | 85 | # Create validation logic 86 | def validate_context_and_answer(example, pred, trace=None): 87 | answer_EM = answer_exact_match(example, pred) 88 | answer_PM = answer_passage_match(example, pred) 89 | return answer_EM and answer_PM 90 | 91 | # Define examples with the necessary fields 92 | train_example1 = Example(question="What did young Philemon wanted to become when he grew up?", answer="Engineer") 93 | train_example2 = Example(question="What did Philemon realize his curiosity was pushing him towards as he grew older?", answer="Sciences") 94 | train_example3 = Example(question="How many years after graduation did Philemon spent working in the academic writing industry?", answer="Eight") 95 | train_example4 = Example(question="Which is one of the subjects that Philemon handled in academic writing assignments?", answer="Nursing") 96 | train_example5 = Example(question="What made the global academic system to go into hibernation?", answer="Covid") 97 | train_example6 = Example(question="Which year did the usual peak season failed to materialize?", answer="2021") 98 | train_example7 = Example(question="When was the ranking systems introduced to deny all other writers the chance to see available orders?", answer="2023") 99 | train_example8 = Example(question="In 2024, how many orders had Philemon completed until February 15?", answer="4") 100 | train_example9 = Example(question="What was the main reason Philemon wanted to branch into other high-demand fields?", answer="Income") 101 | train_example10 = Example(question="What did Philemon eventually venture into in his undergraduate studies?", answer="Chemistry") 102 | 103 | # Tell DSPy that the 'question' field is the input 104 | trainset = [ 105 | train_example1.with_inputs('question'), 106 | train_example2.with_inputs('question'), 107 | train_example3.with_inputs('question'), 108 | 
train_example4.with_inputs('question'), 109 | train_example5.with_inputs('question'), 110 | train_example6.with_inputs('question'), 111 | train_example7.with_inputs('question'), 112 | train_example8.with_inputs('question'), 113 | train_example9.with_inputs('question'), 114 | train_example10.with_inputs('question'), 115 | ] 116 | 117 | print("Trainset created") 118 | 119 | # Set up teleprompter 120 | teleprompter = BootstrapFewShot(metric=validate_context_and_answer) 121 | 122 | compiled_rag = teleprompter.compile(custom_rag, trainset=trainset) 123 | 124 | # Use compiled_rag to answer questions about your PDF! 125 | question = "When did the rationing of orders took a policy direction?" 126 | pred = compiled_rag(question) 127 | print(f"Question: {question}") 128 | print(f"Predicted Answer: {pred.answer}") 129 | print("Retrieved Contexts:") 130 | for context in pred.context: 131 | full_context = ''.join(context) 132 | print(full_context) 133 | 134 | 135 | #Output 136 | #Started parsing the file under job_id 65bd7202-7285-44d3-8f02-7a1a115a4367 137 | #Documents created 138 | 139 | #Question: What did Phil wanted to become when he grew up? 140 | #Predicted Answer: An engineer 141 | 142 | #100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [03:00<00:00, 18.05s/it] 143 | #Bootstrapped 1 full traces after 10 examples in round 0. 144 | 145 | #Question: When did the rationing of orders took a policy direction? 146 | #Predicted Answer: 2023 147 | #Retrieved Contexts: 148 | #The rationing of orders took a policy direction in 2023. --------------------------------------------------------------------------------