├── .gitignore ├── LICENSE ├── README.md ├── complex_qa.py ├── images ├── baseline.png ├── call_types_table.png ├── end-to-end-query-engine.png ├── equation.png ├── intro.png ├── simple_rag.png ├── task_1_table.png ├── task_2_table.png └── task_3_table.png ├── llama_index_baseline.py ├── openai_utils.py ├── requirements.txt └── subquestion_generator.py /.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | .vscode/ 3 | 4 | **/*.pyc 5 | **/__pycache__/ 6 | 7 | .env 8 | evadb_data/ 9 | **/data_* -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Demystifying Advanced RAG Pipelines 2 | 3 | Retrieval-Augmented Generation (RAG) pipelines powered by large language models (LLMs) are gaining popularity for building end-to-end question answering systems. 
Frameworks such as [LlamaIndex](https://github.com/run-llama/llama_index) and [Haystack](https://github.com/deepset-ai/haystack) have made significant progress in making RAG pipelines easy to use. While these frameworks provide excellent abstractions for building advanced RAG pipelines, they do so at the cost of transparency. From a user perspective, it's not readily apparent what's going on under the hood, particularly when errors or inconsistencies arise. 4 | 5 | In this [EvaDB](https://github.com/georgia-tech-db/evadb) application, we'll shed light on the inner workings of advanced RAG pipelines by examining the mechanics, limitations, and costs that often remain opaque. 6 | 7 |

 8 | ![Llama working on a laptop](images/intro.png)
 10 | Llama working on a laptop 🙂 11 | 

12 | 13 | ## Quick start 14 | 15 | If you want to jump right in, use the following commands to run the application: 16 | 17 | ``` 18 | pip install -r requirements.txt 19 | 20 | echo OPENAI_API_KEY='yourkey' > .env 21 | python complex_qa.py 22 | ``` 23 | 24 | ## RAG Overview 25 | 26 | Retrieval-augmented generation (RAG) is a cutting-edge AI paradigm for LLM-based question answering. 27 | A RAG pipeline typically contains: 28 | 29 | 1. **Data Warehouse** - A collection of data sources (e.g., documents, tables etc.) that contain information relevant to the question answering task. 30 | 31 | 2. **Vector Retrieval** - Given a question, find the top K most similar data chunks to the question. This is done using a vector store (e.g., [Faiss](https://faiss.ai/index.html)). 32 | 33 | 3. **Response Generation** - Given the top K most similar data chunks, generate a response using a large language model (e.g. GPT-4). 34 | 35 | RAG provides two key advantages over traditional LLM-based question answering: 36 | 1. **Up-to-date information** - The data warehouse can be updated in real-time, so the information is always up-to-date. 37 | 38 | 2. **Source tracking** - RAG provides clear traceability, enabling users to identify the sources of information, which is crucial for accuracy verification and mitigating LLM hallucinations. 39 | 40 | ## Building advanced RAG Pipelines 41 | 42 | To enable answering more complex questions, recent AI frameworks like LlamaIndex have introduced more advanced abstractions such as the [Sub-question Query Engine](https://gpt-index.readthedocs.io/en/latest/examples/query_engine/sub_question_query_engine.html). 43 | 44 | In this application, we'll demystify sophisticated RAG pipelines by using the Sub-question Query Engine as an example. We'll examine the inner workings of the Sub-question Query Engine and simplify the abstractions to their core components. We'll also identify some challenges associated with advanced RAG pipelines. 45 | 46 | ### The setup 47 | 48 | A data warehouse is a collection of data sources (e.g., documents, tables etc.) that contain information relevant to the question answering task. 49 | 50 | In this example, we'll use a simple data warehouse containing multiple Wikipedia articles for different popular cities, inspired by LlamaIndex's [illustrative use-case](https://docs.llamaindex.ai/en/stable/examples/index_structs/doc_summary/DocSummary.html). Each city's wiki is a separate data source. Note that for simplicity, we limit each document's size to fit within the LLM context limit. 51 | 52 | Our goal is to build a system that can answer questions like: 53 | 1. *"What is the population of Chicago?"* 54 | 2. *"Give me a summary of the positive aspects of Atlanta."* 55 | 3. *"Which city has the highest population?"* 56 | 57 | As you can see, the questions can be simple factoid/summarization questions over a single data source (Q1/Q2) or complex factoid/summarization questions over multiple data sources (Q3). 58 | 59 | We have the following *retrieval methods* at our disposal: 60 | 61 | 1. **vector retrieval** - Given a question and a data source, generate an LLM response using the top-K most similar data chunks to the question from the data source as the context. We use the off-the-shelf FAISS vector index from [EvaDB](https://github.com/georgia-tech-db/evadb) for vector retrieval. However, the concepts are applicable to any vector index. 62 | 63 | 2. 
**summary retrieval** - Given a summary question and a data source, generate an LLM response using the entire data source as context. 64 | 65 | ### The secret sauce 66 | 67 | Our key insight is that each component in an advanced RAG pipeline is powered by a single LLM call. The entire pipeline is a series of LLM calls with carefully crafted prompt templates. These prompt templates are the secret sauce that enable advanced RAG pipelines to perform complex tasks. 68 | 69 | In fact, any advanced RAG pipeline can be broken down into a series of individual LLM calls that follow a universal input pattern: 70 | 71 | ![equation](images/equation.png) 72 | 73 | 74 | where: 75 | - **Prompt Template** - A curated prompt template for the specific task (e.g., sub-question generation, summarization) 76 | - **Context** - The context to use to perform the task (e.g. top-K most similar data chunks) 77 | - **Question** - The question to answer 78 | 79 | Now, we illustrate this principle by examining the inner workings of the Sub-question Query Engine. 80 | 81 | The Sub-question Query Engine has to perform three tasks: 82 | 1. **Sub-question generation** - Given a complex question, break it down into a set of sub-questions, while identifying the appropriate data source and retrieval function for each sub-question. 83 | 2. **Vector/Summary Retrieval** - For each sub-question, use the chosen retrieval function over the corresponding data source to retrieve the relevant information. 84 | 3. **Response Aggregation** - Aggregate the responses from the sub-questions into a final response. 85 | 86 | Let's examine each task in detail. 87 | 88 | ### Task 1: Sub-question Generation 89 | 90 | Our goal is to break down a complex question into a set of sub-questions, while identifying the appropriate data source and retrieval function for each sub-question. For example, the question *"Which city has the highest population?"* is broken down into five sub-questions, one for each city, of the form *"What is the population of {city}?".* The data source for each sub-question has to be the corresponding city's wiki, and the retrieval function has to be vector retrieval. 91 | 92 | At first glance, this seems like a daunting task. Specifically, we need to answer the following questions: 93 | 1. **How do we know which sub-questions to generate?** 94 | 2. **How do we know which data source to use for each sub-question?** 95 | 3. **How do we know which retrieval function to use for each sub-question?** 96 | 97 | Remarkably, the answer to all three questions is the same - a single LLM call! The entire sub-question query engine is powered by a single LLM call with a carefully crafted prompt template. Let's call this template the **Sub-question Prompt Template**. 98 | 99 | ``` 100 | -- Sub-question Prompt Template -- 101 | 102 | """ 103 | You are an AI assistant that specializes in breaking down complex questions into simpler, manageable sub-questions. 104 | When presented with a complex user question, your role is to generate a list of sub-questions that, when answered, will comprehensively address the original question. 105 | You have at your disposal a pre-defined set of functions and data sources to utilize in answering each sub-question. 106 | If a user question is straightforward, your task is to return the original question, identifying the appropriate function and data source to use for its solution. 
107 | Please remember that you are limited to the provided functions and data sources, and that each sub-question should be a full question that can be answered using a single function and a single data source. 108 | """ 109 | ``` 110 | 111 | The context for the LLM call is the names of the data sources and the functions available to the system. The question is the user question. The LLM outputs a list of sub-questions, each with a function and a data source. 112 | 113 | ![task_1_table](images/task_1_table.png) 114 | 115 | For the three example questions, the LLM returns the following output: 116 | 117 |
**LLM output Table**
| Question | Subquestions | Retrieval method | Data Source |
| --- | --- | --- | --- |
| "What is the population of Chicago?" | "What is the population of Chicago?" | vector retrieval | Chicago |
| "Give me a summary of the positive aspects of Atlanta." | "Give me a summary of the positive aspects of Atlanta." | summary retrieval | Atlanta |
| "Which city has the highest population?" | "What is the population of Toronto?" | vector retrieval | Toronto |
| | "What is the population of Chicago?" | vector retrieval | Chicago |
| | "What is the population of Houston?" | vector retrieval | Houston |
| | "What is the population of Boston?" | vector retrieval | Boston |
| | "What is the population of Atlanta?" | vector retrieval | Atlanta |
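
Below is a minimal, self-contained sketch of what this single Task 1 LLM call can look like with the OpenAI Python client. The prompt wording, the helper name, and the plain-JSON output format are illustrative assumptions; the actual implementation in this repository (see `subquestion_generator.py` below) uses OpenAI function calling with a dynamically generated pydantic schema to get structured output more reliably.

```python
# Illustrative sketch of Task 1 (sub-question generation) as a single LLM call.
# Assumes OPENAI_API_KEY is set in the environment; names and output format are
# simplified assumptions, not the repository's exact API.
import json
from openai import OpenAI

client = OpenAI()

SUBQUESTION_PROMPT = (
    "You are an AI assistant that specializes in breaking down complex questions "
    "into simpler, manageable sub-questions. Each sub-question must be a full question "
    "that can be answered using a single function and a single data source. "
    "Reply with a JSON list of objects with keys: question, function, data_source."
)

def generate_subquestions_sketch(question, data_sources, functions, model="gpt-3.5-turbo"):
    # Context = the names of the available retrieval functions and data sources.
    context = f"Available functions: {functions}\nAvailable data sources: {data_sources}"
    response = client.chat.completions.create(
        model=model,
        temperature=0,
        messages=[
            {"role": "system", "content": SUBQUESTION_PROMPT},
            {"role": "user", "content": f"{context}\nQuestion: {question}"},
        ],
    )
    # Plain JSON parsing can fail if the model adds extra text around the list;
    # function calling with a schema (as in subquestion_generator.py) avoids this.
    return json.loads(response.choices[0].message.content)

# Example usage:
# generate_subquestions_sketch(
#     "Which city has the highest population?",
#     data_sources=["Toronto", "Chicago", "Houston", "Boston", "Atlanta"],
#     functions=["vector_retrieval", "llm_retrieval"],
# )
```

The key point is that the entire sub-question "engine" is this one call: a prompt template, plus the names of the available functions and data sources as context, plus the user question - exactly the universal input pattern identified above.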
172 | 173 | ### Task 2: Vector/Summary Retrieval 174 | 175 | For each sub-question, we use the chosen retrieval function over the corresponding data source to retrieve the relevant information. For example, for the sub-question *"What is the population of Chicago?"*, we use vector retrieval over the Chicago data source. Similarly, for the sub-question *"Give me a summary of the positive aspects of Atlanta."*, we use summary retrieval over the Atlanta data source. 176 | 177 | For both retrieval methods, we use the same LLM prompt template. In fact, we find that the popular **RAG Prompt** from [LangchainHub](https://smith.langchain.com/hub) works great out-of-the-box for this step. 178 | 179 | ``` 180 | -- RAG Prompt Template -- 181 | 182 | """ 183 | You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise. 184 | Question: {question} 185 | Context: {context} 186 | Answer: 187 | ``` 188 | 189 | The two retrieval methods differ only in the context used for the LLM call. For vector retrieval, we use the top K most similar data chunks to the sub-question as context. For summary retrieval, we use the entire data source as context. 190 | 191 | ![task_2_table](images/task_2_table.png) 192 | 193 | ### Task 3: Response Aggregation 194 | 195 | This is the final step that aggregates the responses from the sub-questions into a final response. For example, for the question *"Which city has the highest population?"*, the sub-questions retrieve the population of each city and then response aggregation finds and returns the city with the highest population. 196 | The **RAG Prompt** works great for this step as well. 197 | 198 | The context for the LLM call is the list of responses from the sub-questions. The question is the original user question and the LLM outputs a final response. 199 | 200 | ![task_3_table](images/task_3_table.png) 201 | 202 | ### Putting it all together 203 | 204 | After unraveling the layers of abstraction, we uncovered the secret ingredient powering the sub-question query engine - four types of LLM calls, each with a different prompt template, context, and question. This fits perfectly with the universal input pattern that we identified earlier, and is a far cry from the complex abstractions that we started with. 205 | To summarize: 206 | ![equation](images/equation.png) 207 | ![call_types_table](images/call_types_table.png) 208 | 209 | To see the full pipeline in action, run the following commands: 210 | 211 | ``` 212 | pip install -r requirements.txt 213 | 214 | echo OPENAI_API_KEY='yourkey' > .env 215 | python complex_qa.py 216 | ``` 217 | 218 | Here is an example of the system answering the question *"Which city has the highest population?"*. 219 | 220 | ![full_pipeline](images/simple_rag.png) 221 | 222 | ## Challenges 223 | 224 | Now that we've demystified the inner workings of advanced RAG pipelines, let's examine the challenges associated with them. 225 | 226 | 1. **Question sensitivity** - The biggest challenge that we observed with these systems is question sensitivity. The LLMs are extremely sensitive to the user question, and the pipeline fails unexpectedly for several user questions. Here are a few example failure cases that we encountered: 227 | - **Incorrect sub-questions** - The LLM sometimes generates incorrect sub-questions. 
For example, *"Which city has the highest number of tech companies?"* is broken down into *"What are the tech companies in each city?"* five times (once for each city) instead of *"What is the number of tech companies in Toronto?"*, *"What is the number of tech companies in Chicago?"*, etc. 228 | - **Incorrect retrieval function** - *"Summarize the positive aspects of Atlanta and Toronto."* results in using the vector retrieval function instead of the summary retrieval method. 229 | 230 | We had to put significant effort into prompt engineering to get the pipeline to work for each question. This is a significant challenge for building robust systems. 231 | 232 | To verify this behavior, we [implemented the example](llama_index_baseline.py) using the LlamaIndex Sub-question query engine. Consistent with our observations, the system often generates the wrong sub-questions and also uses the wrong retrieval function for the sub-questions, as shown below. 233 | 234 | ![llama_index_baseline](images/baseline.png) 235 | 236 | 237 | 2. **Cost** - The second challenge is the cost dynamics of advanced RAG pipelines. The issue is two-fold: 238 | - **Cost sensitivity** - The final cost of the question is dependent on the number of sub-questions generated, the retrieval function used, and the number of data sources queried. Since the LLMs are sensitive to the prompt, the cost of the question can vary significantly depending on the question and the LLM output. For example, the incorrect tool choice in the LlamaIndex baseline example above (`summary_tool`) results in a 3x higher cost compared to the `vector_tool` while also generating an incorrect response. 239 | - **Cost estimation** - Advanced abstractions in RAG frameworks obscure the estimated cost of the question. Setting up a cost monitoring system is challenging since the cost of the question is dependent on the LLM output. 240 | 241 | 242 | ## Conclusion 243 | 244 | Advanced RAG pipelines powered by LLMs have revolutionized question-answering systems. 245 | However, as we have seen, these pipelines are not turnkey solutions. Under the hood, they rely on carefully engineered prompt templates and multiple chained LLM calls. As illustrated in this [EvaDB](https://github.com/georgia-tech-db/evadb) application, these pipelines can be question-sensitive, brittle, and opaque in their cost dynamics. Understanding these intricacies is key to leveraging their full potential and paving the way for more robust and efficient systems in the future. 246 | 247 | 248 | 300 | -------------------------------------------------------------------------------- /complex_qa.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | from pathlib import Path 4 | import requests 5 | 6 | import warnings 7 | warnings.filterwarnings("ignore") 8 | 9 | from subquestion_generator import generate_subquestions 10 | import evadb 11 | from openai_utils import llm_call 12 | 13 | 14 | if not load_dotenv(): 15 | print( 16 | "Could not load .env file or it is empty. Please check if it exists and is readable." 17 | ) 18 | exit(1) 19 | 20 | 21 | def generate_vector_stores(cursor, docs): 22 | """Generate a vector store for the docs using evadb. 
23 | """ 24 | for doc in docs: 25 | print(f"Creating vector store for {doc}...") 26 | cursor.query(f"DROP TABLE IF EXISTS {doc};").df() 27 | cursor.query(f"LOAD DOCUMENT 'data/{doc}.txt' INTO {doc};").df() 28 | evadb_path = os.path.dirname(evadb.__file__) 29 | cursor.query( 30 | f"""CREATE FUNCTION IF NOT EXISTS SentenceFeatureExtractor 31 | IMPL '{evadb_path}/functions/sentence_feature_extractor.py'; 32 | """).df() 33 | 34 | cursor.query( 35 | f"""CREATE TABLE IF NOT EXISTS {doc}_features AS 36 | SELECT SentenceFeatureExtractor(data), data FROM {doc};""" 37 | ).df() 38 | 39 | cursor.query( 40 | f"CREATE INDEX IF NOT EXISTS {doc}_index ON {doc}_features (features) USING FAISS;" 41 | ).df() 42 | print(f"Successfully created vector store for {doc}.") 43 | 44 | 45 | def vector_retrieval(cursor, llm_model, question, doc_name): 46 | """Returns the answer to a factoid question using vector retrieval. 47 | """ 48 | res_batch = cursor.query( 49 | f"""SELECT data FROM {doc_name}_features 50 | ORDER BY Similarity(SentenceFeatureExtractor('{question}'),features) 51 | LIMIT 3;""" 52 | ).df() 53 | context_list = [] 54 | for i in range(len(res_batch)): 55 | context_list.append(res_batch["data"][i]) 56 | context = "\n".join(context_list) 57 | user_prompt = f"""You are an assistant for question-answering tasks. 58 | Use the following pieces of retrieved context to answer the question. 59 | If you don't know the answer, just say that you don't know. 60 | Use three sentences maximum and keep the answer concise. 61 | Question: {question} 62 | Context: {context} 63 | Answer:""" 64 | 65 | response, cost = llm_call(model=llm_model, user_prompt=user_prompt) 66 | 67 | answer = response.choices[0].message.content 68 | return answer, cost 69 | 70 | 71 | def summary_retrieval(llm_model, question, doc): 72 | """Returns the answer to a summarization question over the document using summary retrieval. 73 | """ 74 | # context_length = OPENAI_MODEL_CONTEXT_LENGTH[llm_model] 75 | # total_tokens = get_num_tokens_simple(llm_model, wiki_docs[doc]) 76 | user_prompt = f"""Here is some context: {doc} 77 | Use only the provided context to answer the question. 78 | Here is the question: {question}""" 79 | 80 | response, cost = llm_call(model=llm_model, user_prompt=user_prompt) 81 | answer = response.choices[0].message.content 82 | return answer, cost 83 | # load max of context_length tokens from the document 84 | 85 | 86 | def response_aggregator(llm_model, question, responses): 87 | """Aggregates the responses from the subquestions to generate the final response. 88 | """ 89 | print("-------> ⭐ Aggregating responses...") 90 | system_prompt = """You are an assistant for question-answering tasks. 91 | Use the following pieces of retrieved context to answer the question. 92 | If you don't know the answer, just say that you don't know. 
93 | Use three sentences maximum and keep the answer concise.""" 94 | 95 | context = "" 96 | for i, response in enumerate(responses): 97 | context += f"\n{response}" 98 | 99 | user_prompt = f"""Question: {question} 100 | Context: {context} 101 | Answer:""" 102 | 103 | response, cost = llm_call(model=llm_model, system_prompt=system_prompt, user_prompt=user_prompt) 104 | answer = response.choices[0].message.content 105 | return answer, cost 106 | 107 | 108 | def load_wiki_pages(page_titles=["Toronto", "Chicago", "Houston", "Boston", "Atlanta"]): 109 | 110 | # Download all wiki documents 111 | for title in page_titles: 112 | response = requests.get( 113 | "https://en.wikipedia.org/w/api.php", 114 | params={ 115 | "action": "query", 116 | "format": "json", 117 | "titles": title, 118 | "prop": "extracts", 119 | # 'exintro': True, 120 | "explaintext": True, 121 | }, 122 | ).json() 123 | page = next(iter(response["query"]["pages"].values())) 124 | wiki_text = page["extract"] 125 | 126 | data_path = Path("data") 127 | if not data_path.exists(): 128 | Path.mkdir(data_path) 129 | 130 | with open(data_path / f"{title}.txt", "w") as fp: 131 | fp.write(wiki_text) 132 | 133 | # Load all wiki documents 134 | city_docs = {} 135 | for wiki_title in page_titles: 136 | input_text = open(f"data/{wiki_title}.txt", "r").read() 137 | city_docs[wiki_title] = input_text[:10000] 138 | return city_docs 139 | 140 | 141 | if __name__ == "__main__": 142 | 143 | # establish evadb api cursor 144 | print("⏳ Connect to EvaDB...") 145 | cursor = evadb.connect().cursor() 146 | print("✅ Connected to EvaDB...") 147 | 148 | doc_names = ["Toronto", "Chicago", "Houston", "Boston", "Atlanta"] 149 | wiki_docs = load_wiki_pages(page_titles=doc_names) 150 | 151 | question = "Which city has the highest population?" 152 | 153 | user_task = """We have a database of wikipedia articles about several cities. 
154 | We are building an application to answer questions about the cities.""" 155 | 156 | vector_stores = generate_vector_stores(cursor, wiki_docs) 157 | 158 | llm_model = "gpt-3.5-turbo" 159 | total_cost = 0 160 | while True: 161 | question_cost = 0 162 | # Get question from user 163 | question = str(input("Question (enter 'exit' to exit): ")) 164 | if question.lower() == "exit": 165 | break 166 | print("🧠 Generating subquestions...") 167 | subquestions_bundle_list, cost = generate_subquestions(question=question, 168 | file_names=doc_names, 169 | user_task=user_task, 170 | llm_model=llm_model) 171 | question_cost += cost 172 | responses = [] 173 | for q_no, item in enumerate(subquestions_bundle_list): 174 | subquestion = item.question 175 | selected_func = item.function.value 176 | selected_doc = item.file_name.value 177 | print(f"\n-------> 🤔 Processing subquestion #{q_no+1}: {subquestion} | function: {selected_func} | data source: {selected_doc}") 178 | if selected_func == "vector_retrieval": 179 | response, cost = vector_retrieval(cursor, llm_model, subquestion, selected_doc) 180 | elif selected_func == "llm_retrieval": 181 | response, cost = summary_retrieval(llm_model, subquestion, wiki_docs[selected_doc]) 182 | else: 183 | print(f"\nCould not process subquestion: {subquestion} function: {selected_func} data source: {selected_doc}\n") 184 | exit(0) 185 | print(f"✅ Response #{q_no+1}: {response}") 186 | responses.append(response) 187 | question_cost += cost 188 | 189 | aggregated_response, cost = response_aggregator(llm_model, question, responses) 190 | question_cost += cost 191 | print(f"\n✅ Final response: {aggregated_response}") 192 | print(f"🤑 Total cost for the question: ${question_cost:.4f}") 193 | total_cost += question_cost 194 | 195 | print(f"Total cost for all questions: ${total_cost:.4f}") 196 | -------------------------------------------------------------------------------- /images/baseline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/rag-demystified/e7b38d89ed5671675a9299a697e413483b75cfd6/images/baseline.png -------------------------------------------------------------------------------- /images/call_types_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/rag-demystified/e7b38d89ed5671675a9299a697e413483b75cfd6/images/call_types_table.png -------------------------------------------------------------------------------- /images/end-to-end-query-engine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/rag-demystified/e7b38d89ed5671675a9299a697e413483b75cfd6/images/end-to-end-query-engine.png -------------------------------------------------------------------------------- /images/equation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/rag-demystified/e7b38d89ed5671675a9299a697e413483b75cfd6/images/equation.png -------------------------------------------------------------------------------- /images/intro.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/rag-demystified/e7b38d89ed5671675a9299a697e413483b75cfd6/images/intro.png -------------------------------------------------------------------------------- /images/simple_rag.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/rag-demystified/e7b38d89ed5671675a9299a697e413483b75cfd6/images/simple_rag.png -------------------------------------------------------------------------------- /images/task_1_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/rag-demystified/e7b38d89ed5671675a9299a697e413483b75cfd6/images/task_1_table.png -------------------------------------------------------------------------------- /images/task_2_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/rag-demystified/e7b38d89ed5671675a9299a697e413483b75cfd6/images/task_2_table.png -------------------------------------------------------------------------------- /images/task_3_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pchunduri6/rag-demystified/e7b38d89ed5671675a9299a697e413483b75cfd6/images/task_3_table.png -------------------------------------------------------------------------------- /llama_index_baseline.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import requests 4 | 5 | from llama_index import ( 6 | VectorStoreIndex, 7 | SummaryIndex, 8 | SimpleKeywordTableIndex, 9 | SimpleDirectoryReader, 10 | ServiceContext, 11 | ) 12 | from llama_index.schema import IndexNode 13 | from llama_index.tools import QueryEngineTool, ToolMetadata 14 | from llama_index.llms import OpenAI, AzureOpenAI 15 | from llama_index.query_engine import SubQuestionQueryEngine 16 | from llama_index.agent import OpenAIAgent 17 | from llama_index.embeddings import HuggingFaceEmbedding, OpenAIEmbedding 18 | from llama_index.callbacks import CallbackManager, TokenCountingHandler 19 | from llama_index.response_synthesizers import get_response_synthesizer 20 | import tiktoken 21 | 22 | api_type = "" 23 | api_base = "" 24 | api_version = "" 25 | api_key = "" 26 | 27 | 28 | embed_model_name = "hugging_face" 29 | 30 | if embed_model_name == "hugging_face": 31 | embed_model = HuggingFaceEmbedding( 32 | model_name="sentence-transformers/all-mpnet-base-v2", max_length=512 33 | ) 34 | elif embed_model_name == "text-embedding-ada-002": 35 | embed_model = OpenAIEmbedding( 36 | model="text-embedding-ada-002", 37 | deployment_name="text-embedding-ada-002", 38 | api_key=api_key, 39 | api_base=api_base, 40 | api_type=api_type, 41 | api_version=api_version, 42 | ) 43 | 44 | llm = AzureOpenAI( 45 | model="gpt-3.5-turbo", 46 | engine="gpt-35-turbo", 47 | api_key=api_key, 48 | api_base=api_base, 49 | api_type=api_type, 50 | api_version=api_version, 51 | ) 52 | 53 | token_counter = TokenCountingHandler( 54 | tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo").encode 55 | ) 56 | 57 | callback_manager = CallbackManager([token_counter]) 58 | 59 | service_context = ServiceContext.from_defaults( 60 | # system_prompt=system_prompt, 61 | llm=llm, 62 | callback_manager=callback_manager, 63 | embed_model=embed_model, 64 | ) 65 | 66 | 67 | def print_token_count(token_counter, embed_model, model="gpt-35-turbo"): 68 | print( 69 | "Embedding Tokens: ", 70 | token_counter.total_embedding_token_count, 71 | "\n", 72 | "LLM Prompt Tokens: ", 73 | token_counter.prompt_llm_token_count, 74 | "\n", 75 | "LLM Completion Tokens: ", 76 | 
token_counter.completion_llm_token_count, 77 | "\n", 78 | "Total LLM Token Count: ", 79 | token_counter.total_llm_token_count, 80 | "\n", 81 | ) 82 | pricing = { 83 | 'gpt-35-turbo': {'prompt': 0.0015, 'completion': 0.002}, 84 | 'gpt-35-turbo-16k': {'prompt': 0.003, 'completion': 0.004}, 85 | 'gpt-4-0613': {'prompt': 0.03, 'completion': 0.06}, 86 | 'gpt-4-32k': {'prompt': 0.06, 'completion': 0.12}, 87 | 'embedding': {'hugging_face': 0, 'text-embedding-ada-002': 0.0001} 88 | } 89 | print( 90 | "Embedding Cost: ", 91 | pricing['embedding'][embed_model] * token_counter.total_embedding_token_count/1000, 92 | "\n", 93 | "LLM Prompt Cost: ", 94 | pricing[model]["prompt"] * token_counter.prompt_llm_token_count/1000, 95 | "\n", 96 | "LLM Completion Cost: ", 97 | pricing[model]["completion"] * token_counter.completion_llm_token_count/1000, 98 | "\n", 99 | "Total LLM Cost: ", 100 | pricing[model]["prompt"] * token_counter.prompt_llm_token_count/1000 + pricing[model]["completion"] * token_counter.completion_llm_token_count/1000, 101 | "\n", 102 | "Total cost: ", 103 | pricing['embedding'][embed_model] * token_counter.total_embedding_token_count/1000 + pricing[model]["prompt"] * token_counter.prompt_llm_token_count/1000 + pricing[model]["completion"] * token_counter.completion_llm_token_count/1000, 104 | ) 105 | 106 | 107 | if __name__ == "__main__": 108 | wiki_titles = ["Toronto", "Chicago", "Houston", "Boston", "Atlanta"] 109 | 110 | for title in wiki_titles: 111 | response = requests.get( 112 | "https://en.wikipedia.org/w/api.php", 113 | params={ 114 | "action": "query", 115 | "format": "json", 116 | "titles": title, 117 | "prop": "extracts", 118 | # 'exintro': True, 119 | "explaintext": True, 120 | }, 121 | ).json() 122 | page = next(iter(response["query"]["pages"].values())) 123 | wiki_text = page["extract"] 124 | 125 | data_path = Path("data") 126 | if not data_path.exists(): 127 | Path.mkdir(data_path) 128 | 129 | with open(data_path / f"{title}.txt", "w") as fp: 130 | fp.write(wiki_text) 131 | 132 | # Load all wiki documents 133 | city_docs = {} 134 | for wiki_title in wiki_titles: 135 | city_docs[wiki_title] = SimpleDirectoryReader( 136 | input_files=[f"data/{wiki_title}.txt"] 137 | ).load_data() 138 | 139 | # # Build agents dictionary 140 | # agents = {} 141 | 142 | query_engine_tools = [] 143 | for wiki_title in wiki_titles: 144 | # build vector index 145 | vector_index = VectorStoreIndex.from_documents( 146 | city_docs[wiki_title], service_context=service_context 147 | ) 148 | # build summary index 149 | summary_index = SummaryIndex.from_documents( 150 | city_docs[wiki_title], service_context=service_context 151 | ) 152 | # define query engines 153 | vector_query_engine = vector_index.as_query_engine() 154 | list_query_engine = summary_index.as_query_engine() 155 | 156 | # define tools 157 | query_engine_tools_per_doc = [ 158 | QueryEngineTool( 159 | query_engine=vector_query_engine, 160 | metadata=ToolMetadata( 161 | name=f"vector_tool_{wiki_title}", 162 | description="Useful for questions related to specific aspects of" 163 | f" {wiki_title} (e.g. the history, arts and culture," 164 | " sports, demographics, or more).", 165 | ), 166 | ), 167 | QueryEngineTool( 168 | query_engine=list_query_engine, 169 | metadata=ToolMetadata( 170 | name=f"summary_tool_{wiki_title}", 171 | description="Useful for any requests that require a holistic summary" 172 | f" of EVERYTHING about {wiki_title}. 
For questions about" 173 | " more specific sections, please use the" 174 | f" vector_tool_{wiki_title}.", 175 | ), 176 | ), 177 | ] 178 | 179 | query_engine_tools.extend(query_engine_tools_per_doc) 180 | 181 | # build agent 182 | # function_llm = OpenAI(model="gpt-3.5-turbo-0613") 183 | # agent = OpenAIAgent.from_tools( 184 | # query_engine_tools, 185 | # llm=llm, 186 | # verbose=True, 187 | # ) 188 | 189 | # agents[wiki_title] = agent 190 | 191 | response_synthesizer = get_response_synthesizer( 192 | service_context=service_context, 193 | response_mode="compact", 194 | ) 195 | 196 | sub_query_engine = SubQuestionQueryEngine.from_defaults( 197 | query_engine_tools=query_engine_tools, 198 | response_synthesizer=response_synthesizer, 199 | service_context=service_context, 200 | use_async=False, 201 | verbose=True, 202 | ) 203 | 204 | question = "Which are the sports teams in Toronto?" 205 | print("Question: ", question) 206 | response = sub_query_engine.query(question) 207 | print_token_count(token_counter, embed_model_name) 208 | -------------------------------------------------------------------------------- /openai_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import logging 4 | 5 | from openai import OpenAI 6 | client = OpenAI() 7 | import tiktoken 8 | 9 | from tenacity import ( 10 | retry, 11 | stop_after_attempt, 12 | wait_random_exponential, 13 | after_log, 14 | ) # for exponential backoff 15 | 16 | logging.basicConfig(stream=sys.stderr, level=logging.INFO) 17 | logger = logging.getLogger(__name__) 18 | 19 | OPENAI_PRICING = { 20 | "gpt-3.5-turbo": {"prompt": 0.0015, "completion": 0.002}, 21 | "gpt-3.5-turbo-16k": {"prompt": 0.003, "completion": 0.004}, 22 | "gpt-4": {"prompt": 0.03, "completion": 0.06}, 23 | "gpt-4-32k": {"prompt": 0.06, "completion": 0.12}, 24 | "embedding": {"hugging_face": 0, "text-embedding-ada-002": 0.0001}, 25 | } 26 | 27 | 28 | OPENAI_MODEL_CONTEXT_LENGTH = { 29 | "gpt-3.5-turbo": 4097, 30 | "gpt-3.5-turbo-16k": 16385, 31 | "gpt-4-0613": 8192, 32 | "gpt-4-32k": 32768, 33 | } 34 | 35 | 36 | @retry( 37 | wait=wait_random_exponential(min=1, max=60), 38 | stop=stop_after_attempt(6), 39 | after=after_log(logger, logging.INFO), 40 | ) 41 | def completion_with_backoff(**kwargs): 42 | return client.chat.completions.create(**kwargs) 43 | 44 | 45 | def llm_call_cost(response): 46 | """Returns the cost of the LLM call in dollars""" 47 | model = response.model 48 | usage = response.usage 49 | prompt_cost = OPENAI_PRICING[model]["prompt"] 50 | completion_cost = OPENAI_PRICING[model]["completion"] 51 | prompt_token_cost = (usage.prompt_tokens * prompt_cost) / 1000 52 | completion_token_cost = (usage.completion_tokens * completion_cost) / 1000 53 | return prompt_token_cost + completion_token_cost 54 | 55 | 56 | def llm_call( 57 | model, 58 | function_schema=None, 59 | output_schema=None, 60 | system_prompt="You are an AI assistant that answers user questions using the context provided.", 61 | user_prompt="Please help me answer the following question:", 62 | few_shot_examples=None, 63 | ): 64 | kwargs = {} 65 | if function_schema is not None: 66 | kwargs["functions"] = function_schema 67 | if output_schema is not None: 68 | kwargs["function_call"] = output_schema 69 | 70 | messages = [] 71 | if system_prompt is not None: 72 | messages.append({"role": "system", "content": system_prompt}) 73 | if few_shot_examples is not None: 74 | messages.extend(few_shot_examples) 75 | if user_prompt is not None: 76 
| messages.append({"role": "user", "content": user_prompt}) 77 | 78 | response = completion_with_backoff( 79 | model=model, 80 | temperature=0, 81 | messages=messages, 82 | **kwargs 83 | ) 84 | 85 | # print cost of call 86 | call_cost = llm_call_cost(response) 87 | print(f"🤑 LLM call cost: ${call_cost:.4f}") 88 | return response, call_cost 89 | 90 | 91 | def get_num_tokens_simple(model, prompt): 92 | """Estimate the number of tokens in the prompt using tiktoken""" 93 | encoding = tiktoken.encoding_for_model(model) 94 | num_tokens = len(encoding.encode(prompt)) 95 | return num_tokens 96 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | evadb[document] 2 | openai>=1.0 3 | instructor 4 | pydantic==2.4.0 5 | python-dotenv==1.0.0 6 | tiktoken 7 | tenacity -------------------------------------------------------------------------------- /subquestion_generator.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import List 3 | from enum import Enum 4 | 5 | from instructor import OpenAISchema 6 | from pydantic import Field, create_model 7 | from openai_utils import llm_call 8 | 9 | 10 | # DEFAULT_SUBQUESTION_GENERATOR_PROMPT = """ 11 | # You are an AI agent that takes a complex user question and returns a list of simple subquestions to answer the user's question. 12 | # You are provided a set of functions and data sources that you can use to answer each subquestion. 13 | # If the user question is simple, just return the user question, the function, and the data source to use. 14 | # You can only use the provided functions and data sources. 15 | # The subquestions should be complete questions that can be answered by a single function and a single data source. 16 | # """ 17 | 18 | # DEFAULT_SUBQUESTION_GENERATOR_PROMPT = """ 19 | # You are an AI assistant that specializes in breaking down complex questions into simpler, manageable sub-questions. 20 | # When presented with a complex user question, your role is to generate a list of sub-questions that, when answered, will comprehensively address the original query. 21 | # You have at your disposal a pre-defined set of functions and data sources to utilize in answering each sub-question. 22 | # If a user question is straightforward, your task is to return the original question, identifying the appropriate function and data source to use for its solution. 23 | # Please remember that you are limited to the provided functions and data sources, and that each sub-question should be a full question that can be answered using a single function and a single data source. 24 | # """ 25 | 26 | DEFAULT_SUBQUESTION_GENERATOR_PROMPT = """ 27 | You are an AI assistant that specializes in breaking down complex questions into simpler, manageable sub-questions. 28 | You have at your disposal a pre-defined set of functions and files to utilize in answering each sub-question. 29 | Please remember that your output should only contain the provided function names and file names, and that each sub-question should be a full question that can be answered using a single function and a single file. 30 | """ 31 | 32 | DEFAULT_USER_TASK = "" 33 | 34 | 35 | class FunctionEnum(str, Enum): 36 | """The function to use to answer the questions. 37 | Use vector_retrieval for fact-based questions such as demographics, sports, arts and culture, etc. 
38 | Use llm_retrieval for summarization questions, such as positive aspects, history, etc. 39 | """ 40 | 41 | VECTOR_RETRIEVAL = "vector_retrieval" 42 | LLM_RETRIEVAL = "llm_retrieval" 43 | 44 | 45 | def generate_subquestions( 46 | question, 47 | file_names: List[str] = None, 48 | system_prompt=DEFAULT_SUBQUESTION_GENERATOR_PROMPT, 49 | user_task=DEFAULT_USER_TASK, 50 | llm_model="gpt-4-0613", 51 | ): 52 | """Generates a list of subquestions from a user question along with the 53 | file name and the function to use to answer the question using OpenAI LLM. 54 | """ 55 | FilenameEnum = Enum("FilenameEnum", {x.upper(): x for x in file_names}) 56 | FilenameEnum.__doc__ = f"The names of the file to use to answer the corresponding subquestion - e.g. {file_names[0]}" 57 | 58 | # Create pydantic class dynamically 59 | QuestionBundle = create_model( 60 | "QuestionBundle", 61 | question=( 62 | str, 63 | Field( 64 | None, description="The subquestion extracted from the user's question" 65 | ), 66 | ), 67 | function=(FunctionEnum, Field(None)), 68 | file_name=(FilenameEnum, Field(None)), 69 | ) 70 | 71 | SubQuestionBundleList = create_model( 72 | "SubQuestionBundleList", 73 | subquestion_bundle_list=( 74 | List[QuestionBundle], 75 | Field( 76 | None, 77 | description="A list of subquestions - each item in the list contains a question, a function, and a file name", 78 | ), 79 | ), 80 | __base__=OpenAISchema, 81 | ) 82 | 83 | user_prompt = f"{user_task}\n Here is the user question: {question}" 84 | 85 | few_shot_examples = [ 86 | { 87 | "role": "user", 88 | "content": "Compare the population of Atlanta and Toronto?", 89 | }, 90 | { 91 | "role": "function", 92 | "name": "SubQuestionBundleList", 93 | "content": """ 94 | { 95 | "subquestion_bundle_list": [ 96 | { 97 | "question": "What is the population of Atlanta?", 98 | "function": "vector_retrieval", 99 | "file_name": "Atlanta" 100 | }, 101 | { 102 | "question": "What is the population of Toronto?" 103 | "function": "vector_retrieval", 104 | "file_name": "Toronto" 105 | } 106 | ] 107 | }""", 108 | }, 109 | { 110 | "role": "user", 111 | "content": "Summarize the history of Chicago and Houston.", 112 | }, 113 | { 114 | "role": "function", 115 | "name": "SubQuestionBundleList", 116 | "content": """ 117 | { 118 | "subquestion_bundle_list": [ 119 | { 120 | "question": "What is the history of Chicago?", 121 | "function": "llm_retrieval", 122 | "file_name": "Chicago" 123 | }, 124 | { 125 | "question": "What is the history of Houston?", 126 | "function": "llm_retrieval", 127 | "file_name": "Houston" 128 | } 129 | ] 130 | }""", 131 | }, 132 | ] 133 | 134 | response, cost = llm_call( 135 | model=llm_model, 136 | function_schema=[SubQuestionBundleList.openai_schema], 137 | output_schema={"name": SubQuestionBundleList.openai_schema["name"]}, 138 | system_prompt=system_prompt, 139 | user_prompt=user_prompt, 140 | few_shot_examples=few_shot_examples, 141 | ) 142 | 143 | subquestions_list = json.loads(response.choices[0].message.function_call.arguments) 144 | 145 | subquestions_pydantic_obj = SubQuestionBundleList(**subquestions_list) 146 | subquestions_list = subquestions_pydantic_obj.subquestion_bundle_list 147 | return subquestions_list, cost 148 | --------------------------------------------------------------------------------