├── .gitignore ├── LICENSE ├── README.md ├── complex_qa.py ├── images ├── baseline.png ├── call_types_table.png ├── end-to-end-query-engine.png ├── equation.png ├── intro.png ├── simple_rag.png ├── task_1_table.png ├── task_2_table.png └── task_3_table.png ├── llama_index_baseline.py ├── openai_utils.py ├── requirements.txt └── subquestion_generator.py /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | .vscode/ 3 | 4 | **/*.pyc 5 | **/__pycache__/ 6 | 7 | .env 8 | evadb_data/ 9 | **/data_* -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Demystifying Advanced RAG Pipelines 2 | 3 | Retrieval-Augmented Generation (RAG) pipelines powered by large language models (LLMs) are gaining popularity for building end-to-end question answering systems. 
Frameworks such as [LlamaIndex](https://github.com/run-llama/llama_index) and [Haystack](https://github.com/deepset-ai/haystack) have made significant progress in making RAG pipelines easy to use. While these frameworks provide excellent abstractions for building advanced RAG pipelines, they do so at the cost of transparency. From a user perspective, it's not readily apparent what's going on under the hood, particularly when errors or inconsistencies arise. 4 | 5 | In this [EvaDB](https://github.com/georgia-tech-db/evadb) application, we'll shed light on the inner workings of advanced RAG pipelines by examining the mechanics, limitations, and costs that often remain opaque. 6 | 7 |

 8 | ![Llama working on a laptop](images/intro.png)
 10 | Llama working on a laptop 🙂 11 | 

12 | 13 | ## Quick start 14 | 15 | If you want to jump right in, use the following commands to run the application: 16 | 17 | ``` 18 | pip install -r requirements.txt 19 | 20 | echo OPENAI_API_KEY='yourkey' > .env 21 | python complex_qa.py 22 | ``` 23 | 24 | ## RAG Overview 25 | 26 | Retrieval-augmented generation (RAG) is a cutting-edge AI paradigm for LLM-based question answering. 27 | A RAG pipeline typically contains: 28 | 29 | 1. **Data Warehouse** - A collection of data sources (e.g., documents, tables etc.) that contain information relevant to the question answering task. 30 | 31 | 2. **Vector Retrieval** - Given a question, find the top K most similar data chunks to the question. This is done using a vector store (e.g., [Faiss](https://faiss.ai/index.html)). 32 | 33 | 3. **Response Generation** - Given the top K most similar data chunks, generate a response using a large language model (e.g. GPT-4). 34 | 35 | RAG provides two key advantages over traditional LLM-based question answering: 36 | 1. **Up-to-date information** - The data warehouse can be updated in real-time, so the information is always up-to-date. 37 | 38 | 2. **Source tracking** - RAG provides clear traceability, enabling users to identify the sources of information, which is crucial for accuracy verification and mitigating LLM hallucinations. 39 | 40 | ## Building advanced RAG Pipelines 41 | 42 | To enable answering more complex questions, recent AI frameworks like LlamaIndex have introduced more advanced abstractions such as the [Sub-question Query Engine](https://gpt-index.readthedocs.io/en/latest/examples/query_engine/sub_question_query_engine.html). 43 | 44 | In this application, we'll demystify sophisticated RAG pipelines by using the Sub-question Query Engine as an example. We'll examine the inner workings of the Sub-question Query Engine and simplify the abstractions to their core components. We'll also identify some challenges associated with advanced RAG pipelines. 45 | 46 | ### The setup 47 | 48 | A data warehouse is a collection of data sources (e.g., documents, tables etc.) that contain information relevant to the question answering task. 49 | 50 | In this example, we'll use a simple data warehouse containing multiple Wikipedia articles for different popular cities, inspired by LlamaIndex's [illustrative use-case](https://docs.llamaindex.ai/en/stable/examples/index_structs/doc_summary/DocSummary.html). Each city's wiki is a separate data source. Note that for simplicity, we limit each document's size to fit within the LLM context limit. 51 | 52 | Our goal is to build a system that can answer questions like: 53 | 1. *"What is the population of Chicago?"* 54 | 2. *"Give me a summary of the positive aspects of Atlanta."* 55 | 3. *"Which city has the highest population?"* 56 | 57 | As you can see, the questions can be simple factoid/summarization questions over a single data source (Q1/Q2) or complex factoid/summarization questions over multiple data sources (Q3). 58 | 59 | We have the following *retrieval methods* at our disposal: 60 | 61 | 1. **vector retrieval** - Given a question and a data source, generate an LLM response using the top-K most similar data chunks to the question from the data source as the context. We use the off-the-shelf FAISS vector index from [EvaDB](https://github.com/georgia-tech-db/evadb) for vector retrieval. However, the concepts are applicable to any vector index. 62 | 63 | 2. 
**summary retrieval** - Given a summary question and a data source, generate an LLM response using the entire data source as context. 64 | 65 | ### The secret sauce 66 | 67 | Our key insight is that each component in an advanced RAG pipeline is powered by a single LLM call. The entire pipeline is a series of LLM calls with carefully crafted prompt templates. These prompt templates are the secret sauce that enable advanced RAG pipelines to perform complex tasks. 68 | 69 | In fact, any advanced RAG pipeline can be broken down into a series of individual LLM calls that follow a universal input pattern: 70 | 71 | ![equation](images/equation.png) 72 | 73 | 74 | where: 75 | - **Prompt Template** - A curated prompt template for the specific task (e.g., sub-question generation, summarization) 76 | - **Context** - The context to use to perform the task (e.g. top-K most similar data chunks) 77 | - **Question** - The question to answer 78 | 79 | Now, we illustrate this principle by examining the inner workings of the Sub-question Query Engine. 80 | 81 | The Sub-question Query Engine has to perform three tasks: 82 | 1. **Sub-question generation** - Given a complex question, break it down into a set of sub-questions, while identifying the appropriate data source and retrieval function for each sub-question. 83 | 2. **Vector/Summary Retrieval** - For each sub-question, use the chosen retrieval function over the corresponding data source to retrieve the relevant information. 84 | 3. **Response Aggregation** - Aggregate the responses from the sub-questions into a final response. 85 | 86 | Let's examine each task in detail. 87 | 88 | ### Task 1: Sub-question Generation 89 | 90 | Our goal is to break down a complex question into a set of sub-questions, while identifying the appropriate data source and retrieval function for each sub-question. For example, the question *"Which city has the highest population?"* is broken down into five sub-questions, one for each city, of the form *"What is the population of {city}?".* The data source for each sub-question has to be the corresponding city's wiki, and the retrieval function has to be vector retrieval. 91 | 92 | At first glance, this seems like a daunting task. Specifically, we need to answer the following questions: 93 | 1. **How do we know which sub-questions to generate?** 94 | 2. **How do we know which data source to use for each sub-question?** 95 | 3. **How do we know which retrieval function to use for each sub-question?** 96 | 97 | Remarkably, the answer to all three questions is the same - a single LLM call! The entire sub-question query engine is powered by a single LLM call with a carefully crafted prompt template. Let's call this template the **Sub-question Prompt Template**. 98 | 99 | ``` 100 | -- Sub-question Prompt Template -- 101 | 102 | """ 103 | You are an AI assistant that specializes in breaking down complex questions into simpler, manageable sub-questions. 104 | When presented with a complex user question, your role is to generate a list of sub-questions that, when answered, will comprehensively address the original question. 105 | You have at your disposal a pre-defined set of functions and data sources to utilize in answering each sub-question. 106 | If a user question is straightforward, your task is to return the original question, identifying the appropriate function and data source to use for its solution. 
107 | Please remember that you are limited to the provided functions and data sources, and that each sub-question should be a full question that can be answered using a single function and a single data source. 108 | """ 109 | ``` 110 | 111 | The context for the LLM call is the names of the data sources and the functions available to the system. The question is the user question. The LLM outputs a list of sub-questions, each with a function and a data source. 112 | 113 | ![task_1_table](images/task_1_table.png) 114 | 115 | For the three example questions, the LLM returns the following output: 116 | 117 |
**LLM output Table**
| Question | Subquestions | Retrieval method | Data Source |
| --- | --- | --- | --- |
| "What is the population of Chicago?" | "What is the population of Chicago?" | vector retrieval | Chicago |
| "Give me a summary of the positive aspects of Atlanta." | "Give me a summary of the positive aspects of Atlanta." | summary retrieval | Atlanta |
| "Which city has the highest population?" | "What is the population of Toronto?" | vector retrieval | Toronto |
| | "What is the population of Chicago?" | vector retrieval | Chicago |
| | "What is the population of Houston?" | vector retrieval | Houston |
| | "What is the population of Boston?" | vector retrieval | Boston |
| | "What is the population of Atlanta?" | vector retrieval | Atlanta |
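
Below is a minimal, self-contained sketch of what this single Task 1 LLM call can look like with the OpenAI Python client. The prompt wording, the helper name, and the plain-JSON output format are illustrative assumptions; the actual implementation in this repository (see `subquestion_generator.py` below) uses OpenAI function calling with a dynamically generated pydantic schema to get structured output more reliably.

```python
# Illustrative sketch of Task 1 (sub-question generation) as a single LLM call.
# Assumes OPENAI_API_KEY is set in the environment; names and output format are
# simplified assumptions, not the repository's exact API.
import json
from openai import OpenAI

client = OpenAI()

SUBQUESTION_PROMPT = (
    "You are an AI assistant that specializes in breaking down complex questions "
    "into simpler, manageable sub-questions. Each sub-question must be a full question "
    "that can be answered using a single function and a single data source. "
    "Reply with a JSON list of objects with keys: question, function, data_source."
)

def generate_subquestions_sketch(question, data_sources, functions, model="gpt-3.5-turbo"):
    # Context = the names of the available retrieval functions and data sources.
    context = f"Available functions: {functions}\nAvailable data sources: {data_sources}"
    response = client.chat.completions.create(
        model=model,
        temperature=0,
        messages=[
            {"role": "system", "content": SUBQUESTION_PROMPT},
            {"role": "user", "content": f"{context}\nQuestion: {question}"},
        ],
    )
    # Plain JSON parsing can fail if the model adds extra text around the list;
    # function calling with a schema (as in subquestion_generator.py) avoids this.
    return json.loads(response.choices[0].message.content)

# Example usage:
# generate_subquestions_sketch(
#     "Which city has the highest population?",
#     data_sources=["Toronto", "Chicago", "Houston", "Boston", "Atlanta"],
#     functions=["vector_retrieval", "llm_retrieval"],
# )
```

The key point is that the entire sub-question "engine" is this one call: a prompt template, plus the names of the available functions and data sources as context, plus the user question - exactly the universal input pattern identified above.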
172 | 173 | ### Task 2: Vector/Summary Retrieval 174 | 175 | For each sub-question, we use the chosen retrieval function over the corresponding data source to retrieve the relevant information. For example, for the sub-question *"What is the population of Chicago?"*, we use vector retrieval over the Chicago data source. Similarly, for the sub-question *"Give me a summary of the positive aspects of Atlanta."*, we use summary retrieval over the Atlanta data source. 176 | 177 | For both retrieval methods, we use the same LLM prompt template. In fact, we find that the popular **RAG Prompt** from [LangchainHub](https://smith.langchain.com/hub) works great out-of-the-box for this step. 178 | 179 | ``` 180 | -- RAG Prompt Template -- 181 | 182 | """ 183 | You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise. 184 | Question: {question} 185 | Context: {context} 186 | Answer: 187 | ``` 188 | 189 | The two retrieval methods differ only in the context used for the LLM call. For vector retrieval, we use the top K most similar data chunks to the sub-question as context. For summary retrieval, we use the entire data source as context. 190 | 191 | ![task_2_table](images/task_2_table.png) 192 | 193 | ### Task 3: Response Aggregation 194 | 195 | This is the final step that aggregates the responses from the sub-questions into a final response. For example, for the question *"Which city has the highest population?"*, the sub-questions retrieve the population of each city and then response aggregation finds and returns the city with the highest population. 196 | The **RAG Prompt** works great for this step as well. 197 | 198 | The context for the LLM call is the list of responses from the sub-questions. The question is the original user question and the LLM outputs a final response. 199 | 200 | ![task_3_table](images/task_3_table.png) 201 | 202 | ### Putting it all together 203 | 204 | After unraveling the layers of abstraction, we uncovered the secret ingredient powering the sub-question query engine - four types of LLM calls, each with a different prompt template, context, and question. This fits perfectly with the universal input pattern that we identified earlier, and is a far cry from the complex abstractions that we started with. 205 | To summarize: 206 | ![equation](images/equation.png) 207 | ![call_types_table](images/call_types_table.png) 208 | 209 | To see the full pipeline in action, run the following commands: 210 | 211 | ``` 212 | pip install -r requirements.txt 213 | 214 | echo OPENAI_API_KEY='yourkey' > .env 215 | python complex_qa.py 216 | ``` 217 | 218 | Here is an example of the system answering the question *"Which city has the highest population?"*. 219 | 220 | ![full_pipeline](images/simple_rag.png) 221 | 222 | ## Challenges 223 | 224 | Now that we've demystified the inner workings of advanced RAG pipelines, let's examine the challenges associated with them. 225 | 226 | 1. **Question sensitivity** - The biggest challenge that we observed with these systems is question sensitivity. The LLMs are extremely sensitive to the user question, and the pipeline fails unexpectedly for several user questions. Here are a few example failure cases that we encountered: 227 | - **Incorrect sub-questions** - The LLM sometimes generates incorrect sub-questions. 
For example, *"Which city has the highest number of tech companies?"* is broken down into *"What are the tech companies in each city?"* five times (once for each city) instead of *"What is the number of tech companies in Toronto?"*, *"What is the number of tech companies in Chicago?"*, etc. 228 | - **Incorrect retrieval function** - *"Summarize the positive aspects of Atlanta and Toronto."* results in using the vector retrieval function instead of the summary retrieval method. 229 | 230 | We had to put significant effort into prompt engineering to get the pipeline to work for each question. This is a significant challenge for building robust systems. 231 | 232 | To verify this behavior, we [implemented the example](llama_index_baseline.py) using the LlamaIndex Sub-question query engine. Consistent with our observations, the system often generates the wrong sub-questions and also uses the wrong retrieval function for the sub-questions, as shown below. 233 | 234 | ![llama_index_baseline](images/baseline.png) 235 | 236 | 237 | 2. **Cost** - The second challenge is the cost dynamics of advanced RAG pipelines. The issue is two-fold: 238 | - **Cost sensitivity** - The final cost of the question is dependent on the number of sub-questions generated, the retrieval function used, and the number of data sources queried. Since the LLMs are sensitive to the prompt, the cost of the question can vary significantly depending on the question and the LLM output. For example, the incorrect tool choice in the LlamaIndex baseline example above (`summary_tool`) results in a 3x higher cost compared to the `vector_tool` while also generating an incorrect response. 239 | - **Cost estimation** - Advanced abstractions in RAG frameworks obscure the estimated cost of the question. Setting up a cost monitoring system is challenging since the cost of the question is dependent on the LLM output. 240 | 241 | 242 | ## Conclusion 243 | 244 | Advanced RAG pipelines powered by LLMs have revolutionized question-answering systems. 245 | However, as we have seen, these pipelines are not turnkey solutions. Under the hood, they rely on carefully engineered prompt templates and multiple chained LLM calls. As illustrated in this [EvaDB](https://github.com/georgia-tech-db/evadb) application, these pipelines can be question-sensitive, brittle, and opaque in their cost dynamics. Understanding these intricacies is key to leveraging their full potential and paving the way for more robust and efficient systems in the future. 246 | 247 | 248 | 300 | -------------------------------------------------------------------------------- /complex_qa.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | from pathlib import Path 4 | import requests 5 | 6 | import warnings 7 | warnings.filterwarnings("ignore") 8 | 9 | from subquestion_generator import generate_subquestions 10 | import evadb 11 | from openai_utils import llm_call 12 | 13 | 14 | if not load_dotenv(): 15 | print( 16 | "Could not load .env file or it is empty. Please check if it exists and is readable." 17 | ) 18 | exit(1) 19 | 20 | 21 | def generate_vector_stores(cursor, docs): 22 | """Generate a vector store for the docs using evadb. 
23 | """ 24 | for doc in docs: 25 | print(f"Creating vector store for {doc}...") 26 | cursor.query(f"DROP TABLE IF EXISTS {doc};").df() 27 | cursor.query(f"LOAD DOCUMENT 'data/{doc}.txt' INTO {doc};").df() 28 | evadb_path = os.path.dirname(evadb.__file__) 29 | cursor.query( 30 | f"""CREATE FUNCTION IF NOT EXISTS SentenceFeatureExtractor 31 | IMPL '{evadb_path}/functions/sentence_feature_extractor.py'; 32 | """).df() 33 | 34 | cursor.query( 35 | f"""CREATE TABLE IF NOT EXISTS {doc}_features AS 36 | SELECT SentenceFeatureExtractor(data), data FROM {doc};""" 37 | ).df() 38 | 39 | cursor.query( 40 | f"CREATE INDEX IF NOT EXISTS {doc}_index ON {doc}_features (features) USING FAISS;" 41 | ).df() 42 | print(f"Successfully created vector store for {doc}.") 43 | 44 | 45 | def vector_retrieval(cursor, llm_model, question, doc_name): 46 | """Returns the answer to a factoid question using vector retrieval. 47 | """ 48 | res_batch = cursor.query( 49 | f"""SELECT data FROM {doc_name}_features 50 | ORDER BY Similarity(SentenceFeatureExtractor('{question}'),features) 51 | LIMIT 3;""" 52 | ).df() 53 | context_list = [] 54 | for i in range(len(res_batch)): 55 | context_list.append(res_batch["data"][i]) 56 | context = "\n".join(context_list) 57 | user_prompt = f"""You are an assistant for question-answering tasks. 58 | Use the following pieces of retrieved context to answer the question. 59 | If you don't know the answer, just say that you don't know. 60 | Use three sentences maximum and keep the answer concise. 61 | Question: {question} 62 | Context: {context} 63 | Answer:""" 64 | 65 | response, cost = llm_call(model=llm_model, user_prompt=user_prompt) 66 | 67 | answer = response.choices[0].message.content 68 | return answer, cost 69 | 70 | 71 | def summary_retrieval(llm_model, question, doc): 72 | """Returns the answer to a summarization question over the document using summary retrieval. 73 | """ 74 | # context_length = OPENAI_MODEL_CONTEXT_LENGTH[llm_model] 75 | # total_tokens = get_num_tokens_simple(llm_model, wiki_docs[doc]) 76 | user_prompt = f"""Here is some context: {doc} 77 | Use only the provided context to answer the question. 78 | Here is the question: {question}""" 79 | 80 | response, cost = llm_call(model=llm_model, user_prompt=user_prompt) 81 | answer = response.choices[0].message.content 82 | return answer, cost 83 | # load max of context_length tokens from the document 84 | 85 | 86 | def response_aggregator(llm_model, question, responses): 87 | """Aggregates the responses from the subquestions to generate the final response. 88 | """ 89 | print("-------> ⭐ Aggregating responses...") 90 | system_prompt = """You are an assistant for question-answering tasks. 91 | Use the following pieces of retrieved context to answer the question. 92 | If you don't know the answer, just say that you don't know. 
93 | Use three sentences maximum and keep the answer concise.""" 94 | 95 | context = "" 96 | for i, response in enumerate(responses): 97 | context += f"\n{response}" 98 | 99 | user_prompt = f"""Question: {question} 100 | Context: {context} 101 | Answer:""" 102 | 103 | response, cost = llm_call(model=llm_model, system_prompt=system_prompt, user_prompt=user_prompt) 104 | answer = response.choices[0].message.content 105 | return answer, cost 106 | 107 | 108 | def load_wiki_pages(page_titles=["Toronto", "Chicago", "Houston", "Boston", "Atlanta"]): 109 | 110 | # Download all wiki documents 111 | for title in page_titles: 112 | response = requests.get( 113 | "https://en.wikipedia.org/w/api.php", 114 | params={ 115 | "action": "query", 116 | "format": "json", 117 | "titles": title, 118 | "prop": "extracts", 119 | # 'exintro': True, 120 | "explaintext": True, 121 | }, 122 | ).json() 123 | page = next(iter(response["query"]["pages"].values())) 124 | wiki_text = page["extract"] 125 | 126 | data_path = Path("data") 127 | if not data_path.exists(): 128 | Path.mkdir(data_path) 129 | 130 | with open(data_path / f"{title}.txt", "w") as fp: 131 | fp.write(wiki_text) 132 | 133 | # Load all wiki documents 134 | city_docs = {} 135 | for wiki_title in page_titles: 136 | input_text = open(f"data/{wiki_title}.txt", "r").read() 137 | city_docs[wiki_title] = input_text[:10000] 138 | return city_docs 139 | 140 | 141 | if __name__ == "__main__": 142 | 143 | # establish evadb api cursor 144 | print("⏳ Connect to EvaDB...") 145 | cursor = evadb.connect().cursor() 146 | print("✅ Connected to EvaDB...") 147 | 148 | doc_names = ["Toronto", "Chicago", "Houston", "Boston", "Atlanta"] 149 | wiki_docs = load_wiki_pages(page_titles=doc_names) 150 | 151 | question = "Which city has the highest population?" 152 | 153 | user_task = """We have a database of wikipedia articles about several cities. 
154 | We are building an application to answer questions about the cities.""" 155 | 156 | vector_stores = generate_vector_stores(cursor, wiki_docs) 157 | 158 | llm_model = "gpt-3.5-turbo" 159 | total_cost = 0 160 | while True: 161 | question_cost = 0 162 | # Get question from user 163 | question = str(input("Question (enter 'exit' to exit): ")) 164 | if question.lower() == "exit": 165 | break 166 | print("🧠 Generating subquestions...") 167 | subquestions_bundle_list, cost = generate_subquestions(question=question, 168 | file_names=doc_names, 169 | user_task=user_task, 170 | llm_model=llm_model) 171 | question_cost += cost 172 | responses = [] 173 | for q_no, item in enumerate(subquestions_bundle_list): 174 | subquestion = item.question 175 | selected_func = item.function.value 176 | selected_doc = item.file_name.value 177 | print(f"\n-------> 🤔 Processing subquestion #{q_no+1}: {subquestion} | function: {selected_func} | data source: {selected_doc}") 178 | if selected_func == "vector_retrieval": 179 | response, cost = vector_retrieval(cursor, llm_model, subquestion, selected_doc) 180 | elif selected_func == "llm_retrieval": 181 | response, cost = summary_retrieval(llm_model, subquestion, wiki_docs[selected_doc]) 182 | else: 183 | print(f"\nCould not process subquestion: {subquestion} function: {selected_func} data source: {selected_doc}\n") 184 | exit(0) 185 | print(f"✅ Response #{q_no+1}: {response}") 186 | responses.append(response) 187 | question_cost += cost 188 | 189 | aggregated_response, cost = response_aggregator(llm_model, question, responses) 190 | question_cost += cost 191 | print(f"\n✅ Final response: {aggregated_response}") 192 | print(f"🤑 Total cost for the question: ${question_cost:.4f}") 193 | total_cost += question_cost 194 | 195 | print(f"Total cost for all questions: ${total_cost:.4f}") 196 | -------------------------------------------------------------------------------- /images/baseline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/rag-demystified/e7b38d89ed5671675a9299a697e413483b75cfd6/images/baseline.png -------------------------------------------------------------------------------- /images/call_types_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/rag-demystified/e7b38d89ed5671675a9299a697e413483b75cfd6/images/call_types_table.png -------------------------------------------------------------------------------- /images/end-to-end-query-engine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/rag-demystified/e7b38d89ed5671675a9299a697e413483b75cfd6/images/end-to-end-query-engine.png -------------------------------------------------------------------------------- /images/equation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/rag-demystified/e7b38d89ed5671675a9299a697e413483b75cfd6/images/equation.png -------------------------------------------------------------------------------- /images/intro.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/rag-demystified/e7b38d89ed5671675a9299a697e413483b75cfd6/images/intro.png -------------------------------------------------------------------------------- /images/simple_rag.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/rag-demystified/e7b38d89ed5671675a9299a697e413483b75cfd6/images/simple_rag.png -------------------------------------------------------------------------------- /images/task_1_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/rag-demystified/e7b38d89ed5671675a9299a697e413483b75cfd6/images/task_1_table.png -------------------------------------------------------------------------------- /images/task_2_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/rag-demystified/e7b38d89ed5671675a9299a697e413483b75cfd6/images/task_2_table.png -------------------------------------------------------------------------------- /images/task_3_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/rag-demystified/e7b38d89ed5671675a9299a697e413483b75cfd6/images/task_3_table.png -------------------------------------------------------------------------------- /llama_index_baseline.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import requests 4 | 5 | from llama_index import ( 6 | VectorStoreIndex, 7 | SummaryIndex, 8 | SimpleKeywordTableIndex, 9 | SimpleDirectoryReader, 10 | ServiceContext, 11 | ) 12 | from llama_index.schema import IndexNode 13 | from llama_index.tools import QueryEngineTool, ToolMetadata 14 | from llama_index.llms import OpenAI, AzureOpenAI 15 | from llama_index.query_engine import SubQuestionQueryEngine 16 | from llama_index.agent import OpenAIAgent 17 | from llama_index.embeddings import HuggingFaceEmbedding, OpenAIEmbedding 18 | from llama_index.callbacks import CallbackManager, TokenCountingHandler 19 | from llama_index.response_synthesizers import get_response_synthesizer 20 | import tiktoken 21 | 22 | api_type = "" 23 | api_base = "" 24 | api_version = "" 25 | api_key = "" 26 | 27 | 28 | embed_model_name = "hugging_face" 29 | 30 | if embed_model_name == "hugging_face": 31 | embed_model = HuggingFaceEmbedding( 32 | model_name="sentence-transformers/all-mpnet-base-v2", max_length=512 33 | ) 34 | elif embed_model_name == "text-embedding-ada-002": 35 | embed_model = OpenAIEmbedding( 36 | model="text-embedding-ada-002", 37 | deployment_name="text-embedding-ada-002", 38 | api_key=api_key, 39 | api_base=api_base, 40 | api_type=api_type, 41 | api_version=api_version, 42 | ) 43 | 44 | llm = AzureOpenAI( 45 | model="gpt-3.5-turbo", 46 | engine="gpt-35-turbo", 47 | api_key=api_key, 48 | api_base=api_base, 49 | api_type=api_type, 50 | api_version=api_version, 51 | ) 52 | 53 | token_counter = TokenCountingHandler( 54 | tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo").encode 55 | ) 56 | 57 | callback_manager = CallbackManager([token_counter]) 58 | 59 | service_context = ServiceContext.from_defaults( 60 | # system_prompt=system_prompt, 61 | llm=llm, 62 | callback_manager=callback_manager, 63 | embed_model=embed_model, 64 | ) 65 | 66 | 67 | def print_token_count(token_counter, embed_model, model="gpt-35-turbo"): 68 | print( 69 | "Embedding Tokens: ", 70 | token_counter.total_embedding_token_count, 71 | "\n", 72 | "LLM Prompt Tokens: ", 73 | token_counter.prompt_llm_token_count, 74 | "\n", 75 | "LLM Completion Tokens: ", 76 | 
token_counter.completion_llm_token_count, 77 | "\n", 78 | "Total LLM Token Count: ", 79 | token_counter.total_llm_token_count, 80 | "\n", 81 | ) 82 | pricing = { 83 | 'gpt-35-turbo': {'prompt': 0.0015, 'completion': 0.002}, 84 | 'gpt-35-turbo-16k': {'prompt': 0.003, 'completion': 0.004}, 85 | 'gpt-4-0613': {'prompt': 0.03, 'completion': 0.06}, 86 | 'gpt-4-32k': {'prompt': 0.06, 'completion': 0.12}, 87 | 'embedding': {'hugging_face': 0, 'text-embedding-ada-002': 0.0001} 88 | } 89 | print( 90 | "Embedding Cost: ", 91 | pricing['embedding'][embed_model] * token_counter.total_embedding_token_count/1000, 92 | "\n", 93 | "LLM Prompt Cost: ", 94 | pricing[model]["prompt"] * token_counter.prompt_llm_token_count/1000, 95 | "\n", 96 | "LLM Completion Cost: ", 97 | pricing[model]["completion"] * token_counter.completion_llm_token_count/1000, 98 | "\n", 99 | "Total LLM Cost: ", 100 | pricing[model]["prompt"] * token_counter.prompt_llm_token_count/1000 + pricing[model]["completion"] * token_counter.completion_llm_token_count/1000, 101 | "\n", 102 | "Total cost: ", 103 | pricing['embedding'][embed_model] * token_counter.total_embedding_token_count/1000 + pricing[model]["prompt"] * token_counter.prompt_llm_token_count/1000 + pricing[model]["completion"] * token_counter.completion_llm_token_count/1000, 104 | ) 105 | 106 | 107 | if __name__ == "__main__": 108 | wiki_titles = ["Toronto", "Chicago", "Houston", "Boston", "Atlanta"] 109 | 110 | for title in wiki_titles: 111 | response = requests.get( 112 | "https://en.wikipedia.org/w/api.php", 113 | params={ 114 | "action": "query", 115 | "format": "json", 116 | "titles": title, 117 | "prop": "extracts", 118 | # 'exintro': True, 119 | "explaintext": True, 120 | }, 121 | ).json() 122 | page = next(iter(response["query"]["pages"].values())) 123 | wiki_text = page["extract"] 124 | 125 | data_path = Path("data") 126 | if not data_path.exists(): 127 | Path.mkdir(data_path) 128 | 129 | with open(data_path / f"{title}.txt", "w") as fp: 130 | fp.write(wiki_text) 131 | 132 | # Load all wiki documents 133 | city_docs = {} 134 | for wiki_title in wiki_titles: 135 | city_docs[wiki_title] = SimpleDirectoryReader( 136 | input_files=[f"data/{wiki_title}.txt"] 137 | ).load_data() 138 | 139 | # # Build agents dictionary 140 | # agents = {} 141 | 142 | query_engine_tools = [] 143 | for wiki_title in wiki_titles: 144 | # build vector index 145 | vector_index = VectorStoreIndex.from_documents( 146 | city_docs[wiki_title], service_context=service_context 147 | ) 148 | # build summary index 149 | summary_index = SummaryIndex.from_documents( 150 | city_docs[wiki_title], service_context=service_context 151 | ) 152 | # define query engines 153 | vector_query_engine = vector_index.as_query_engine() 154 | list_query_engine = summary_index.as_query_engine() 155 | 156 | # define tools 157 | query_engine_tools_per_doc = [ 158 | QueryEngineTool( 159 | query_engine=vector_query_engine, 160 | metadata=ToolMetadata( 161 | name=f"vector_tool_{wiki_title}", 162 | description="Useful for questions related to specific aspects of" 163 | f" {wiki_title} (e.g. the history, arts and culture," 164 | " sports, demographics, or more).", 165 | ), 166 | ), 167 | QueryEngineTool( 168 | query_engine=list_query_engine, 169 | metadata=ToolMetadata( 170 | name=f"summary_tool_{wiki_title}", 171 | description="Useful for any requests that require a holistic summary" 172 | f" of EVERYTHING about {wiki_title}. 
For questions about" 173 | " more specific sections, please use the" 174 | f" vector_tool_{wiki_title}.", 175 | ), 176 | ), 177 | ] 178 | 179 | query_engine_tools.extend(query_engine_tools_per_doc) 180 | 181 | # build agent 182 | # function_llm = OpenAI(model="gpt-3.5-turbo-0613") 183 | # agent = OpenAIAgent.from_tools( 184 | # query_engine_tools, 185 | # llm=llm, 186 | # verbose=True, 187 | # ) 188 | 189 | # agents[wiki_title] = agent 190 | 191 | response_synthesizer = get_response_synthesizer( 192 | service_context=service_context, 193 | response_mode="compact", 194 | ) 195 | 196 | sub_query_engine = SubQuestionQueryEngine.from_defaults( 197 | query_engine_tools=query_engine_tools, 198 | response_synthesizer=response_synthesizer, 199 | service_context=service_context, 200 | use_async=False, 201 | verbose=True, 202 | ) 203 | 204 | question = "Which are the sports teams in Toronto?" 205 | print("Question: ", question) 206 | response = sub_query_engine.query(question) 207 | print_token_count(token_counter, embed_model_name) 208 | -------------------------------------------------------------------------------- /openai_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import logging 4 | 5 | from openai import OpenAI 6 | client = OpenAI() 7 | import tiktoken 8 | 9 | from tenacity import ( 10 | retry, 11 | stop_after_attempt, 12 | wait_random_exponential, 13 | after_log, 14 | ) # for exponential backoff 15 | 16 | logging.basicConfig(stream=sys.stderr, level=logging.INFO) 17 | logger = logging.getLogger(__name__) 18 | 19 | OPENAI_PRICING = { 20 | "gpt-3.5-turbo": {"prompt": 0.0015, "completion": 0.002}, 21 | "gpt-3.5-turbo-16k": {"prompt": 0.003, "completion": 0.004}, 22 | "gpt-4": {"prompt": 0.03, "completion": 0.06}, 23 | "gpt-4-32k": {"prompt": 0.06, "completion": 0.12}, 24 | "embedding": {"hugging_face": 0, "text-embedding-ada-002": 0.0001}, 25 | } 26 | 27 | 28 | OPENAI_MODEL_CONTEXT_LENGTH = { 29 | "gpt-3.5-turbo": 4097, 30 | "gpt-3.5-turbo-16k": 16385, 31 | "gpt-4-0613": 8192, 32 | "gpt-4-32k": 32768, 33 | } 34 | 35 | 36 | @retry( 37 | wait=wait_random_exponential(min=1, max=60), 38 | stop=stop_after_attempt(6), 39 | after=after_log(logger, logging.INFO), 40 | ) 41 | def completion_with_backoff(**kwargs): 42 | return client.chat.completions.create(**kwargs) 43 | 44 | 45 | def llm_call_cost(response): 46 | """Returns the cost of the LLM call in dollars""" 47 | model = response.model 48 | usage = response.usage 49 | prompt_cost = OPENAI_PRICING[model]["prompt"] 50 | completion_cost = OPENAI_PRICING[model]["completion"] 51 | prompt_token_cost = (usage.prompt_tokens * prompt_cost) / 1000 52 | completion_token_cost = (usage.completion_tokens * completion_cost) / 1000 53 | return prompt_token_cost + completion_token_cost 54 | 55 | 56 | def llm_call( 57 | model, 58 | function_schema=None, 59 | output_schema=None, 60 | system_prompt="You are an AI assistant that answers user questions using the context provided.", 61 | user_prompt="Please help me answer the following question:", 62 | few_shot_examples=None, 63 | ): 64 | kwargs = {} 65 | if function_schema is not None: 66 | kwargs["functions"] = function_schema 67 | if output_schema is not None: 68 | kwargs["function_call"] = output_schema 69 | 70 | messages = [] 71 | if system_prompt is not None: 72 | messages.append({"role": "system", "content": system_prompt}) 73 | if few_shot_examples is not None: 74 | messages.extend(few_shot_examples) 75 | if user_prompt is not None: 76 
| messages.append({"role": "user", "content": user_prompt}) 77 | 78 | response = completion_with_backoff( 79 | model=model, 80 | temperature=0, 81 | messages=messages, 82 | **kwargs 83 | ) 84 | 85 | # print cost of call 86 | call_cost = llm_call_cost(response) 87 | print(f"🤑 LLM call cost: ${call_cost:.4f}") 88 | return response, call_cost 89 | 90 | 91 | def get_num_tokens_simple(model, prompt): 92 | """Estimate the number of tokens in the prompt using tiktoken""" 93 | encoding = tiktoken.encoding_for_model(model) 94 | num_tokens = len(encoding.encode(prompt)) 95 | return num_tokens 96 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | evadb[document] 2 | openai>=1.0 3 | instructor 4 | pydantic==2.4.0 5 | python-dotenv==1.0.0 6 | tiktoken 7 | tenacity -------------------------------------------------------------------------------- /subquestion_generator.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import List 3 | from enum import Enum 4 | 5 | from instructor import OpenAISchema 6 | from pydantic import Field, create_model 7 | from openai_utils import llm_call 8 | 9 | 10 | # DEFAULT_SUBQUESTION_GENERATOR_PROMPT = """ 11 | # You are an AI agent that takes a complex user question and returns a list of simple subquestions to answer the user's question. 12 | # You are provided a set of functions and data sources that you can use to answer each subquestion. 13 | # If the user question is simple, just return the user question, the function, and the data source to use. 14 | # You can only use the provided functions and data sources. 15 | # The subquestions should be complete questions that can be answered by a single function and a single data source. 16 | # """ 17 | 18 | # DEFAULT_SUBQUESTION_GENERATOR_PROMPT = """ 19 | # You are an AI assistant that specializes in breaking down complex questions into simpler, manageable sub-questions. 20 | # When presented with a complex user question, your role is to generate a list of sub-questions that, when answered, will comprehensively address the original query. 21 | # You have at your disposal a pre-defined set of functions and data sources to utilize in answering each sub-question. 22 | # If a user question is straightforward, your task is to return the original question, identifying the appropriate function and data source to use for its solution. 23 | # Please remember that you are limited to the provided functions and data sources, and that each sub-question should be a full question that can be answered using a single function and a single data source. 24 | # """ 25 | 26 | DEFAULT_SUBQUESTION_GENERATOR_PROMPT = """ 27 | You are an AI assistant that specializes in breaking down complex questions into simpler, manageable sub-questions. 28 | You have at your disposal a pre-defined set of functions and files to utilize in answering each sub-question. 29 | Please remember that your output should only contain the provided function names and file names, and that each sub-question should be a full question that can be answered using a single function and a single file. 30 | """ 31 | 32 | DEFAULT_USER_TASK = "" 33 | 34 | 35 | class FunctionEnum(str, Enum): 36 | """The function to use to answer the questions. 37 | Use vector_retrieval for fact-based questions such as demographics, sports, arts and culture, etc. 
38 | Use llm_retrieval for summarization questions, such as positive aspects, history, etc. 39 | """ 40 | 41 | VECTOR_RETRIEVAL = "vector_retrieval" 42 | LLM_RETRIEVAL = "llm_retrieval" 43 | 44 | 45 | def generate_subquestions( 46 | question, 47 | file_names: List[str] = None, 48 | system_prompt=DEFAULT_SUBQUESTION_GENERATOR_PROMPT, 49 | user_task=DEFAULT_USER_TASK, 50 | llm_model="gpt-4-0613", 51 | ): 52 | """Generates a list of subquestions from a user question along with the 53 | file name and the function to use to answer the question using OpenAI LLM. 54 | """ 55 | FilenameEnum = Enum("FilenameEnum", {x.upper(): x for x in file_names}) 56 | FilenameEnum.__doc__ = f"The names of the file to use to answer the corresponding subquestion - e.g. {file_names[0]}" 57 | 58 | # Create pydantic class dynamically 59 | QuestionBundle = create_model( 60 | "QuestionBundle", 61 | question=( 62 | str, 63 | Field( 64 | None, description="The subquestion extracted from the user's question" 65 | ), 66 | ), 67 | function=(FunctionEnum, Field(None)), 68 | file_name=(FilenameEnum, Field(None)), 69 | ) 70 | 71 | SubQuestionBundleList = create_model( 72 | "SubQuestionBundleList", 73 | subquestion_bundle_list=( 74 | List[QuestionBundle], 75 | Field( 76 | None, 77 | description="A list of subquestions - each item in the list contains a question, a function, and a file name", 78 | ), 79 | ), 80 | __base__=OpenAISchema, 81 | ) 82 | 83 | user_prompt = f"{user_task}\n Here is the user question: {question}" 84 | 85 | few_shot_examples = [ 86 | { 87 | "role": "user", 88 | "content": "Compare the population of Atlanta and Toronto?", 89 | }, 90 | { 91 | "role": "function", 92 | "name": "SubQuestionBundleList", 93 | "content": """ 94 | { 95 | "subquestion_bundle_list": [ 96 | { 97 | "question": "What is the population of Atlanta?", 98 | "function": "vector_retrieval", 99 | "file_name": "Atlanta" 100 | }, 101 | { 102 | "question": "What is the population of Toronto?" 103 | "function": "vector_retrieval", 104 | "file_name": "Toronto" 105 | } 106 | ] 107 | }""", 108 | }, 109 | { 110 | "role": "user", 111 | "content": "Summarize the history of Chicago and Houston.", 112 | }, 113 | { 114 | "role": "function", 115 | "name": "SubQuestionBundleList", 116 | "content": """ 117 | { 118 | "subquestion_bundle_list": [ 119 | { 120 | "question": "What is the history of Chicago?", 121 | "function": "llm_retrieval", 122 | "file_name": "Chicago" 123 | }, 124 | { 125 | "question": "What is the history of Houston?", 126 | "function": "llm_retrieval", 127 | "file_name": "Houston" 128 | } 129 | ] 130 | }""", 131 | }, 132 | ] 133 | 134 | response, cost = llm_call( 135 | model=llm_model, 136 | function_schema=[SubQuestionBundleList.openai_schema], 137 | output_schema={"name": SubQuestionBundleList.openai_schema["name"]}, 138 | system_prompt=system_prompt, 139 | user_prompt=user_prompt, 140 | few_shot_examples=few_shot_examples, 141 | ) 142 | 143 | subquestions_list = json.loads(response.choices[0].message.function_call.arguments) 144 | 145 | subquestions_pydantic_obj = SubQuestionBundleList(**subquestions_list) 146 | subquestions_list = subquestions_pydantic_obj.subquestion_bundle_list 147 | return subquestions_list, cost 148 | --------------------------------------------------------------------------------