├── .gitattributes ├── .gitignore ├── Assets └── Images │ ├── 3.1.png │ ├── 3.2.png │ ├── 3.3.png │ ├── 3.4.png │ ├── 3.5.png │ ├── 4.+1.png │ ├── 4.+2.png │ ├── 4.1.png │ ├── 4.2.png │ ├── 4.3.png │ ├── 5.1 1.png │ ├── 5.1.png │ ├── 6.1.png │ ├── 6.2.png │ ├── MEAP-HI 2.png │ ├── MEAP-HI.png │ ├── NewMEAP.png │ ├── NewMEAPFooter.png │ ├── NewMEAPHeader.png │ └── profile_s.png ├── Chapters ├── Chapter-03 │ └── indexing_pipeline.ipynb ├── Chapter-04 │ ├── generation_pipeline.ipynb │ └── xtra_tfidf_bm25_retriever.ipynb ├── Chapter-05 │ ├── evaluators.py │ └── rag_evaluations.ipynb ├── Chapter-06 │ └── advanced_rag.ipynb └── Readme.md ├── LICENSE ├── README.md ├── example_dot_env └── requirements.txt /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 105 | __pypackages__/ 106 | 107 | # Celery stuff 108 | celerybeat-schedule 109 | celerybeat.pid 110 | 111 | # SageMath parsed files 112 | *.sage.py 113 | 114 | # Environments 115 | .env 116 | .venv 117 | env/ 118 | venv/ 119 | ENV/ 120 | env.bak/ 121 | venv.bak/ 122 | 123 | # Spyder project settings 124 | .spyderproject 125 | .spyproject 126 | 127 | # Rope project settings 128 | .ropeproject 129 | 130 | # mkdocs documentation 131 | /site 132 | 133 | # mypy 134 | .mypy_cache/ 135 | .dmypy.json 136 | dmypy.json 137 | 138 | # Pyre type checker 139 | .pyre/ 140 | 141 | # pytype static type analyzer 142 | .pytype/ 143 | 144 | # Cython debug symbols 145 | cython_debug/ 146 | 147 | # PyCharm 148 | # JetBrains specific template is maintainted in a separate JetBrains.gitignore that can 149 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 150 | # and can be added to the global gitignore or merged into this file. For a more nuclear 151 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
152 | #.idea/ 153 | 154 | .myenv 155 | .env 156 | requirements 2.txt 157 | temp.ipynb 158 | Chapters/Chapter-05/xtra_benchmarking.ipynb 159 | xtra_benchmarking.ipynb 160 | *.faiss 161 | *.pkl 162 | 163 | -------------------------------------------------------------------------------- /Assets/Images/3.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/3.1.png -------------------------------------------------------------------------------- /Assets/Images/3.2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/3.2.png -------------------------------------------------------------------------------- /Assets/Images/3.3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/3.3.png -------------------------------------------------------------------------------- /Assets/Images/3.4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/3.4.png -------------------------------------------------------------------------------- /Assets/Images/3.5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/3.5.png -------------------------------------------------------------------------------- /Assets/Images/4.+1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/4.+1.png -------------------------------------------------------------------------------- /Assets/Images/4.+2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/4.+2.png -------------------------------------------------------------------------------- /Assets/Images/4.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/4.1.png -------------------------------------------------------------------------------- /Assets/Images/4.2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/4.2.png -------------------------------------------------------------------------------- /Assets/Images/4.3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/4.3.png -------------------------------------------------------------------------------- /Assets/Images/5.1 1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/5.1 1.png -------------------------------------------------------------------------------- /Assets/Images/5.1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/5.1.png -------------------------------------------------------------------------------- /Assets/Images/6.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/6.1.png -------------------------------------------------------------------------------- /Assets/Images/6.2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/6.2.png -------------------------------------------------------------------------------- /Assets/Images/MEAP-HI 2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/MEAP-HI 2.png -------------------------------------------------------------------------------- /Assets/Images/MEAP-HI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/MEAP-HI.png -------------------------------------------------------------------------------- /Assets/Images/NewMEAP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/NewMEAP.png -------------------------------------------------------------------------------- /Assets/Images/NewMEAPFooter.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/NewMEAPFooter.png -------------------------------------------------------------------------------- /Assets/Images/NewMEAPHeader.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/NewMEAPHeader.png -------------------------------------------------------------------------------- /Assets/Images/profile_s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/profile_s.png -------------------------------------------------------------------------------- /Chapters/Chapter-04/generation_pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | " \"New\n", 9 | "\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Chapter 04 - Generation Pipeline: Generating Contextual LLM Responses" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "### Welcome to chapter 4 of A Simple Introduction to Retrieval Augmented Generation.\n", 24 | "\n", 25 | "In this chapter, we introduce the concepts behind the real-time generation pipeline that uses the knowledge base created by the indexing pipeline. This will complete the development of a simple RAG system.\n", 26 | "\n", 27 | "The generation pipeline consists of three steps -\n", 28 | "\n", 29 | "1. Retrieval\n", 30 | "2. Augmentation\n", 31 | "3. 
Generation" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## Installing Dependencies" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "All the necessary libraries for running this notebook along with their versions can be found in __requirements.txt__ file in the root directory of this repository\n", 53 | "\n", 54 | "You should go to the root directory and run the following command to install the libraries\n", 55 | "\n", 56 | "```\n", 57 | "pip install -r requirements.txt\n", 58 | "```\n", 59 | "\n", 60 | "This is the recommended method of installing the dependencies\n", 61 | "\n", 62 | "___\n", 63 | "Alternatively, you can run the command from this notebook too. The relative path may vary" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 1, 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "name": "stdout", 73 | "output_type": "stream", 74 | "text": [ 75 | "\n", 76 | "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n", 77 | "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", 78 | "Note: you may need to restart the kernel to use updated packages.\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "%pip install -r ../../requirements.txt --quiet" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "## 1. Load the Vector Index" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "In Chapter 3, we were working on indexing the Wikipedia page for the 2023 cricket world cup. 
If you recall we had used embeddings from OpenAI to encode the text and used FAISS as the vector index to store the embeddings. We also stored the FAISS index in a local directory. Let’s reuse this index" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "Note: You will need an __OpenAI API Key__ which can be obtained from [OpenAI](https://platform.openai.com/api-keys) to reuse the embeddings." 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "To initialize the __OpenAI client__, we need to pass the api key. There are many ways of doing it. \n", 112 | "\n", 113 | "#### [Option 1] Creating a .env file for storing the API key and using it # Recommended\n", 114 | "\n", 115 | "Install the __dotenv__ library\n", 116 | "\n", 117 | "_The dotenv library is a popular tool used in various programming languages, including Python and Node.js, to manage environment variables in development and deployment environments. It allows developers to load environment variables from a .env file into their application's environment._\n", 118 | "\n", 119 | "- Create a file named .env in the root directory of their project.\n", 120 | "- Inside the .env file, then define environment variables in the format VARIABLE_NAME=value. \n", 121 | "\n", 122 | "e.g.\n", 123 | "\n", 124 | "OPENAI_API_KEY=YOUR API KEY" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 2, 130 | "metadata": {}, 131 | "outputs": [ 132 | { 133 | "name": "stdout", 134 | "output_type": "stream", 135 | "text": [ 136 | "Success: .env file found with some environment variables\n" 137 | ] 138 | } 139 | ], 140 | "source": [ 141 | "from dotenv import load_dotenv\n", 142 | "import os\n", 143 | "\n", 144 | "if load_dotenv():\n", 145 | " print(\"Success: .env file found with some environment variables\")\n", 146 | "else:\n", 147 | " print(\"Caution: No environment variables found. 
Please create .env file in the root directory or add environment variables in the .env file\")" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "#### [Option 2] Alternatively, you can set the API key in code. \n", 155 | "However, this is not recommended since it can leave your key exposed for potential misuse. Uncomment the cell below to use this method." 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 3, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "#import os\n", 165 | "# os.environ[\"OPENAI_API_KEY\"] = \"sk-proj-******\" #Imp : Replace with an OpenAI API Key" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "We can also test if the key is valid or not" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 3, 178 | "metadata": {}, 179 | "outputs": [ 180 | { 181 | "name": "stdout", 182 | "output_type": "stream", 183 | "text": [ 184 | "OPENAI_API_KEY is set and is valid\n" 185 | ] 186 | } 187 | ], 188 | "source": [ 189 | "api_key=os.environ[\"OPENAI_API_KEY\"]\n", 190 | "\n", 191 | "from openai import OpenAI\n", 192 | "\n", 193 | "client = OpenAI()\n", 194 | "\n", 195 | "\n", 196 | "if api_key:\n", 197 | " try:\n", 198 | " client.models.list()\n", 199 | " print(\"OPENAI_API_KEY is set and is valid\")\n", 200 | " except openai.APIError as e:\n", 201 | " print(f\"OpenAI API returned an API Error: {e}\")\n", 202 | " pass\n", 203 | " except openai.APIConnectionError as e:\n", 204 | " print(f\"Failed to connect to OpenAI API: {e}\")\n", 205 | " pass\n", 206 | " except openai.RateLimitError as e:\n", 207 | " print(f\"OpenAI API request exceeded rate limit: {e}\")\n", 208 | " pass\n", 209 | "\n", 210 | "else:\n", 211 | " print(\"Please set you OpenAI API key as an environment variable OPENAI_API_KEY\")\n", 212 | "\n" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 
5, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "# Import OpenAIEmbeddings from the library\n", 222 | "from langchain_openai import OpenAIEmbeddings\n", 223 | "\n", 224 | "# Instantiate the embeddings object\n", 225 | "embeddings=OpenAIEmbeddings(model=\"text-embedding-3-small\")\n", 226 | "\n", 227 | "# Import FAISS from langchain\n", 228 | "from langchain_community.vectorstores import FAISS\n", 229 | "\n", 230 | "# Load the FAISS vector store with safe deserialization\n", 231 | "vector_store = FAISS.load_local(folder_path=\"../../Assets/Data/\",index_name=\"CWC_index\", embeddings=embeddings, allow_dangerous_deserialization=True)\n" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "## 2. Retrieval" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "We will now retrieve a relevant passage from the knowledge base that is pertinent to our query - __\"Who won the World Cup final?\"__" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 6, 258 | "metadata": {}, 259 | "outputs": [ 260 | { 261 | "name": "stdout", 262 | "output_type": "stream", 263 | "text": [ 264 | " Retrieved Chunk 1: The tournament was contested by ten national teams, maintaining the same format\n", 265 | "used in 2019 . After six weeks of round-robin matches, India , South Africa , Australia , and\n", 266 | "New Zealand finished as the top four and qualified for the knockout stage. In the knockout stage,\n", 267 | "India and Australia beat New Zealand and South Africa, respectively, to advance to the final, played\n", 268 | "on 19 November at the Narendra Modi Stadium in Ahmedabad . 
Australia won the final by six\n", 269 | "wickets, winning their sixth Cricket World Cup title.\n", 270 | "\n", 271 | "\n", 272 | "\n", 273 | " Retrieved Chunk 2: The host India was the first team to qualify for the semi-finals after their\n", 274 | "302-run win against Sri Lanka , their seventh successive win in the World Cup. [ 42 ] India\n", 275 | "secured the top place amongst the semi-finalists after they beat South Africa by 243 runs on 5\n", 276 | "November at Eden Gardens in Kolkata . [ 43 ]\n", 277 | "\n", 278 | "\n", 279 | "\n" 280 | ] 281 | } 282 | ], 283 | "source": [ 284 | "# Define the query\n", 285 | "query = \"Who won the world cup?\"\n", 286 | "\n", 287 | "# Perform similarity search\n", 288 | "retrieved_docs = vector_store.similarity_search(query, k=2) # Get top 2 relevant chunks\n", 289 | "\n", 290 | "# Display results\n", 291 | "\n", 292 | "import textwrap\n", 293 | "\n", 294 | "for i, doc in enumerate(retrieved_docs):\n", 295 | " print(textwrap.fill(f\"\\nRetrieved Chunk {i+1}:\\n{doc.page_content}\",width=100))\n", 296 | " print(\"\\n\\n\")" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "This is the most basic implementation of a retriever in the generation pipeline of a RAG-enabled system. This method of retrieval is enabled by embeddings. We used the text-embedding-3-small from OpenAI. FAISS calculated the similarity score based on these embeddings" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "## 3. Augmentation" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "The information fetched by the retriever should also be sent to the LLM in form of a natural language prompt. This process of combining the user query and the retrieved information is called augmentation." 
318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "We will now execute augmentation with a simple contextual prompt with controlled generation." 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 7, 337 | "metadata": {}, 338 | "outputs": [ 339 | { 340 | "name": "stdout", 341 | "output_type": "stream", 342 | "text": [ 343 | " Given the context below answer the question. Question: Who won the world cup? Context : The tournament was contested by ten national teams,\n", 344 | "maintaining the same format used in 2019 . After six weeks of round-robin matches, India , South Africa , Australia , and New Zealand finished\n", 345 | "as the top four and qualified for the knockout stage. In the knockout stage, India and Australia beat New Zealand and South Africa, respectively, to\n", 346 | "advance to the final, played on 19 November at the Narendra Modi Stadium in Ahmedabad . Australia won the final by six wickets, winning their sixth\n", 347 | "Cricket World Cup title.The host India was the first team to qualify for the semi-finals after their 302-run win against Sri Lanka , their seventh\n", 348 | "successive win in the World Cup. [ 42 ] India secured the top place amongst the semi-finalists after they beat South Africa by 243 runs on 5\n", 349 | "November at Eden Gardens in Kolkata . [ 43 ] Remember to answer only based on the context provided and not from any other source. 
If the\n", 350 | "question cannot be answered based on the provided context, say I don’t know.\n" 351 | ] 352 | } 353 | ], 354 | "source": [ 355 | "# taking first two retrieved documents\n", 356 | "retrieved_context=retrieved_docs[0].page_content + retrieved_docs[1].page_content\n", 357 | "\n", 358 | "# Creating the prompt\n", 359 | "augmented_prompt=f\"\"\"\n", 360 | "\n", 361 | "Given the context below answer the question.\n", 362 | "\n", 363 | "Question: {query} \n", 364 | "\n", 365 | "Context : {retrieved_context}\n", 366 | "\n", 367 | "Remember to answer only based on the context provided and not from any other source. \n", 368 | "\n", 369 | "If the question cannot be answered based on the provided context, say I don’t know.\n", 370 | "\n", 371 | "\"\"\"\n", 372 | "\n", 373 | "print(textwrap.fill(augmented_prompt,width=150))" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "## 4. Generation" 381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "metadata": {}, 386 | "source": [ 387 | "Generation is the final step of this pipeline. While LLMs may be used in any of the previous steps in the pipeline, the generation step is completely reliant on the LLM. The most popular LLMs are the ones being developed by OpenAI, Anthropic, Meta, Google, Microsoft and Mistral amongst other developers. " 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "We have built a simple retriever using FAISS and OpenAI embeddings and, we created a simple augmented prompt. Now we will use OpenAI’s latest model, GPT-4o-mini, to generate the response. 
To do this we will import the __ChatOpenAI__ library from langchain" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [ 402 | { 403 | "name": "stdout", 404 | "output_type": "stream", 405 | "text": [ 406 | "Australia won the world cup.\n" 407 | ] 408 | } 409 | ], 410 | "source": [ 411 | "from langchain_openai import ChatOpenAI\n", 412 | "\n", 413 | "\n", 414 | "# Set up LLM \n", 415 | "llm = ChatOpenAI(\n", 416 | " model=\"gpt-4o-mini\",\n", 417 | " temperature=0,\n", 418 | " max_tokens=None,\n", 419 | " timeout=None,\n", 420 | " max_retries=2\n", 421 | ")\n", 422 | "\n", 423 | "messages=[(\"human\",augmented_prompt)]\n", 424 | "\n", 425 | "ai_msg = llm.invoke(messages)\n", 426 | "\n", 427 | "\n", 428 | "\n", 429 | "# Extract the answer from the response object\n", 430 | "answer=ai_msg.content\n", 431 | "\n", 432 | "print(answer)\n" 433 | ] 434 | }, 435 | { 436 | "cell_type": "markdown", 437 | "metadata": {}, 438 | "source": [ 439 | "# 5. RAG function" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": {}, 445 | "source": [ 446 | "Let us build a function that will take three inputs - \n", 447 | "1. User Query\n", 448 | "2. Location of the Vector Index (Knowledge base)\n", 449 | "3. 
Index Name\n", 450 | "\n", 451 | "And generate an answer along with the retrieved documents" 452 | ] 453 | }, 454 | { 455 | "cell_type": "markdown", 456 | "metadata": {}, 457 | "source": [ 458 | "#### RAG function" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": 9, 464 | "metadata": {}, 465 | "outputs": [], 466 | "source": [ 467 | "import re\n", 468 | "\n", 469 | "# Function to clean text\n", 470 | "def clean_text(text):\n", 471 | " # Replace non-breaking space with regular space\n", 472 | " text = text.replace('\\xa0', ' ')\n", 473 | " \n", 474 | " # Remove any HTML tags (if any)\n", 475 | " text = re.sub(r'<[^>]+>', '', text) # Removes HTML tags\n", 476 | " \n", 477 | " # Remove references in brackets (e.g., [7], [39])\n", 478 | " text = re.sub(r'\\[.*?\\]', '', text) # Removes references inside square brackets\n", 479 | " \n", 480 | " # Remove extra spaces and newlines\n", 481 | " text = ' '.join(text.split()) # This will remove extra spaces and newline characters\n", 482 | " \n", 483 | " return text\n", 484 | "\n", 485 | "def rag_function(query, db_path, index_name):\n", 486 | " embeddings=OpenAIEmbeddings(model=\"text-embedding-3-small\")\n", 487 | "\n", 488 | " db=FAISS.load_local(folder_path=db_path, index_name=index_name, embeddings=embeddings, allow_dangerous_deserialization=True)\n", 489 | "\n", 490 | " retrieved_docs = db.similarity_search(query, k=2)\n", 491 | "\n", 492 | " retrieved_context=[clean_text(retrieved_docs[0].page_content + retrieved_docs[1].page_content)]\n", 493 | "\n", 494 | "\n", 495 | " augmented_prompt=f\"\"\"\n", 496 | "\n", 497 | " Given the context below answer the question.\n", 498 | "\n", 499 | " Question: {query} \n", 500 | "\n", 501 | " Context : {retrieved_context}\n", 502 | "\n", 503 | " Remember to answer only based on the context provided and not from any other source. 
\n", 504 | "\n", 505 | " If the question cannot be answered based on the provided context, say I don’t know.\n", 506 | "\n", 507 | " \"\"\"\n", 508 | "\n", 509 | " llm = ChatOpenAI(\n", 510 | " model=\"gpt-4o-mini\",\n", 511 | " temperature=0,\n", 512 | " max_tokens=None,\n", 513 | " timeout=None,\n", 514 | " max_retries=2\n", 515 | " )\n", 516 | "\n", 517 | " messages=[(\"human\",augmented_prompt)]\n", 518 | "\n", 519 | " ai_msg = llm.invoke(messages)\n", 520 | "\n", 521 | " response=ai_msg.content\n", 522 | "\n", 523 | " return retrieved_context, response\n", 524 | "\n" 525 | ] 526 | }, 527 | { 528 | "cell_type": "markdown", 529 | "metadata": {}, 530 | "source": [ 531 | "Let's try sending our question to this function." 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": 10, 537 | "metadata": {}, 538 | "outputs": [ 539 | { 540 | "data": { 541 | "text/plain": [ 542 | "(['The tournament was contested by ten national teams, maintaining the same format used in 2019 . After six weeks of round-robin matches, India , South Africa , Australia , and New Zealand finished as the top four and qualified for the knockout stage. In the knockout stage, India and Australia beat New Zealand and South Africa, respectively, to advance to the final, played on 19 November at the Narendra Modi Stadium in Ahmedabad . Australia won the final by six wickets, winning their sixth Cricket World Cup title.The host India was the first team to qualify for the semi-finals after their 302-run win against Sri Lanka , their seventh successive win in the World Cup. 
India secured the top place amongst the semi-finalists after they beat South Africa by 243 runs on 5 November at Eden Gardens in Kolkata .'],\n", 543 | " 'Australia won the world cup.')" 544 | ] 545 | }, 546 | "execution_count": 10, 547 | "metadata": {}, 548 | "output_type": "execute_result" 549 | } 550 | ], 551 | "source": [ 552 | "rag_function(query=\"Who won the world cup?\", db_path=\"../../Assets/Data\", index_name=\"CWC_index\")" 553 | ] 554 | }, 555 | { 556 | "cell_type": "markdown", 557 | "metadata": {}, 558 | "source": [ 559 | "Let's ask another one." 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": 11, 565 | "metadata": {}, 566 | "outputs": [ 567 | { 568 | "data": { 569 | "text/plain": [ 570 | "(['Virat Kohli was named the player of the tournament and also scored the most runs, while Mohammed Shami was the leading wicket-taker. A total of 1,250,307 spectators attended the matches, the highest number in any Cricket World Cup to date. The tournament final set viewership records in India, drawing 518 million viewers, with a peak of 57 million streaming viewers.The ICC announced its team of the tournament on 21 November 2023, with Virat Kohli being named as player of the tournament , and Rohit Sharma as captain of the team.'],\n", 571 | " 'Virat Kohli was named the player of the tournament and scored the most runs.')" 572 | ] 573 | }, 574 | "execution_count": 11, 575 | "metadata": {}, 576 | "output_type": "execute_result" 577 | } 578 | ], 579 | "source": [ 580 | "rag_function(\"What was Virat Kohli's achievement in the Cup?\",db_path=\"../../Assets/Data\", index_name=\"CWC_index\")" 581 | ] 582 | }, 583 | { 584 | "cell_type": "markdown", 585 | "metadata": {}, 586 | "source": [ 587 | "We can also ask a list of questions and see what the responses are" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": 13, 593 | "metadata": {}, 594 | "outputs": [], 595 | "source": [ 596 | "list_of_queries=['What was the 
outcome of the match between Australia and the Netherlands on 25 October 2023?',\n", 597 | " 'What ongoing cricket competition is currently taking place that involves multiple international teams?',\n", 598 | " 'What was the deadline for teams to finalize their 15-player squads for the 2023 Cricket World Cup?',\n", 599 | " \"What were the key highlights of the 2023 ICC Men's Cricket World Cup?\",\n", 600 | " 'What were the key outcomes of the 2023 Cricket World Cup, including the final match results and notable player statistics?',\n", 601 | " 'What years had Cricket World Cup finals and their host nations?',\n", 602 | " \"Which org has managed the Cricket World Cup since '75?\",\n", 603 | " \"What was India's winning margin vs. S. Africa on Nov 5, 2023?\",\n", 604 | " 'What teams qualified for the semi-finals in the 2023 Cricket World Cup?']" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": 15, 610 | "metadata": {}, 611 | "outputs": [ 612 | { 613 | "name": "stdout", 614 | "output_type": "stream", 615 | "text": [ 616 | "Query:What was the outcome of the match between Australia and the Netherlands on 25 October 2023?\n", 617 | "Response: I don’t know.\n", 618 | "\n", 619 | "Query:What ongoing cricket competition is currently taking place that involves multiple international teams?\n", 620 | "Response: I don’t know.\n", 621 | "\n", 622 | "Query:What was the deadline for teams to finalize their 15-player squads for the 2023 Cricket World Cup?\n", 623 | "Response: The deadline for teams to finalize their 15-player squads for the 2023 Cricket World Cup was 28 September.\n", 624 | "\n", 625 | "Query:What were the key highlights of the 2023 ICC Men's Cricket World Cup?\n", 626 | "Response: The key highlights of the 2023 ICC Men's Cricket World Cup include:\n", 627 | "\n", 628 | "- Dates: 5 October – 19 November 2023\n", 629 | "- Host: India (first time as the sole host)\n", 630 | "- Format: One Day International (ODI) with a round-robin and 
knockout tournament structure\n", 631 | "- Participants: 10 teams\n", 632 | "- Matches: 48 played\n", 633 | "- Attendance: 1,250,307 (average of 26,048 per match)\n", 634 | "- Champions: Australia (6th title)\n", 635 | "- Runners-up: India\n", 636 | "- Player of the Series: Virat Kohli\n", 637 | "- Most Runs: Virat Kohli (765 runs)\n", 638 | "- Most Wickets: Mohammed Shami (24 wickets)\n", 639 | "\n", 640 | "Query:What were the key outcomes of the 2023 Cricket World Cup, including the final match results and notable player statistics?\n", 641 | "Response: I don’t know.\n", 642 | "\n", 643 | "Query:What years had Cricket World Cup finals and their host nations?\n", 644 | "Response: The years that had Cricket World Cup finals and their host nations are as follows:\n", 645 | "\n", 646 | "- 1975: England\n", 647 | "- 1979: England\n", 648 | "- 1983: England / Wales\n", 649 | "- 1987: Australia / New Zealand\n", 650 | "- 1992: Pakistan / India / Sri Lanka\n", 651 | "- 1996: England / Scotland / Wales / Ireland / Netherlands\n", 652 | "- 1999: South Africa / Zimbabwe / Kenya\n", 653 | "- 2003: West Indies\n", 654 | "- 2007: India / Sri Lanka / Bangladesh\n", 655 | "- 2011: Australia / New Zealand\n", 656 | "- 2015: England / Wales\n", 657 | "- 2019: India\n", 658 | "- 2023: South Africa / Zimbabwe / Namibia\n", 659 | "\n", 660 | "Query:Which org has managed the Cricket World Cup since '75?\n", 661 | "Response: The organization that has managed the Cricket World Cup since 1975 is the International Cricket Council (ICC).\n", 662 | "\n", 663 | "Query:What was India's winning margin vs. S. 
Africa on Nov 5, 2023?\n", 664 | "Response: I don’t know.\n", 665 | "\n", 666 | "Query:What teams qualified for the semi-finals in the 2023 Cricket World Cup?\n", 667 | "Response: I don’t know.\n", 668 | "\n" 669 | ] 670 | } 671 | ], 672 | "source": [ 673 | "for query in list_of_queries:\n", 674 | " print(f\"Query:{query}\")\n", 675 | " print(f\"Response: {rag_function(query,db_path=\"../../Assets/Data\", index_name=\"CWC_index\")[1]}\\n\")\n" 676 | ] 677 | }, 678 | { 679 | "cell_type": "markdown", 680 | "metadata": {}, 681 | "source": [ 682 | "For some of the questions above, the response may be \"I don't know\". That is when the LLM can't find an answer in the retrieved context. In our augmentation step, we had asked the LLM to do so." 683 | ] 684 | }, 685 | { 686 | "cell_type": "markdown", 687 | "metadata": {}, 688 | "source": [ 689 | "---" 690 | ] 691 | }, 692 | { 693 | "cell_type": "markdown", 694 | "metadata": {}, 695 | "source": [ 696 | "Is the RAG system that we have created generating the responses on the expected lines? Is the LLM still hallucinating? Before trying to improve the performance of the system we need to be able to measure and benchmark it. That is what we will do in chapter 5. We will look at the evaluation metrics and the popular benchmarks for RAG." 697 | ] 698 | }, 699 | { 700 | "cell_type": "markdown", 701 | "metadata": {}, 702 | "source": [ 703 | "---" 704 | ] 705 | }, 706 | { 707 | "cell_type": "markdown", 708 | "metadata": {}, 709 | "source": [ 710 | " \n", 711 | "\n", 712 | "Hi! I'm Abhinav! I am an entrepreneur and Vice President of Artificial Intelligence at Yarnit. I have spent over 15 years consulting and leadership roles in data science, machine learning and AI. My current focus is in the applied Generative AI domain focussing on solving enterprise needs through contextual intelligence. I'm passionate about AI advancements constantly exploring emerging technologies to push the boundaries and create positive impacts in the world. 
Let’s build the future, together!\n", 713 | "\n", 714 | "[If you haven't already, please subscribe to the MEAP of A Simple Guide to Retrieval Augmented Generation here](https://mng.bz/8wdg)\n", 715 | "\n", 716 | "\n", 717 | " \"New\n", 718 | "\n", 719 | "\n", 720 | "#### If you'd like to chat, I'd be very happy to connect\n", 721 | "\n", 722 | "[![GitHub followers](https://img.shields.io/badge/Github-000000?style=for-the-badge&logo=github&logoColor=black&color=orange)](https://github.com/abhinav-kimothi)\n", 723 | "[![LinkedIn](https://img.shields.io/badge/LinkedIn-000000?style=for-the-badge&logo=linkedin&logoColor=orange&color=black)](https://www.linkedin.com/comm/mynetwork/discovery-see-all?usecase=PEOPLE_FOLLOWS&followMember=abhinav-kimothi)\n", 724 | "[![Medium](https://img.shields.io/badge/Medium-000000?style=for-the-badge&logo=medium&logoColor=black&color=orange)](https://medium.com/@abhinavkimothi)\n", 725 | "[![Insta](https://img.shields.io/badge/Instagram-000000?style=for-the-badge&logo=instagram&logoColor=orange&color=black)](https://www.instagram.com/akaiworks/)\n", 726 | "[![Mail](https://img.shields.io/badge/email-000000?style=for-the-badge&logo=gmail&logoColor=black&color=orange)](mailto:abhinav.kimothi.ds@gmail.com)\n", 727 | "[![X](https://img.shields.io/badge/Follow-000000?style=for-the-badge&logo=X&logoColor=orange&color=black)](https://twitter.com/abhinav_kimothi)\n", 728 | "[![Linktree](https://img.shields.io/badge/Linktree-000000?style=for-the-badge&logo=linktree&logoColor=black&color=orange)](https://linktr.ee/abhinavkimothi)\n", 729 | "[![Gumroad](https://img.shields.io/badge/Gumroad-000000?style=for-the-badge&logo=gumroad&logoColor=orange&color=black)](https://abhinavkimothi.gumroad.com/)\n", 730 | "\n", 731 | "---" 732 | ] 733 | } 734 | ], 735 | "metadata": { 736 | "kernelspec": { 737 | "display_name": ".sgragch4", 738 | "language": "python", 739 | "name": "python3" 740 | }, 741 | "language_info": { 742 | "codemirror_mode": { 743 | "name": 
"ipython", 744 | "version": 3 745 | }, 746 | "file_extension": ".py", 747 | "mimetype": "text/x-python", 748 | "name": "python", 749 | "nbconvert_exporter": "python", 750 | "pygments_lexer": "ipython3", 751 | "version": "3.13.2" 752 | } 753 | }, 754 | "nbformat": 4, 755 | "nbformat_minor": 2 756 | } 757 | -------------------------------------------------------------------------------- /Chapters/Chapter-04/xtra_tfidf_bm25_retriever.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | " \"New\n", 9 | "\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Chapter 04 [Additional] - TFIDF & BM25 Retrievers" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "## Installing Dependencies\n", 24 | "\n", 25 | "All the necessary libraries for running this notebook along with their versions can be found in __requirements.txt__ file in the root directory of this repository\n", 26 | "\n", 27 | "You should go to the root directory and run the following command to install the libraries\n", 28 | "\n", 29 | "```\n", 30 | "pip install -r requirements.txt\n", 31 | "```\n", 32 | "\n", 33 | "This is the recommended method of installing the dependencies\n", 34 | "\n", 35 | "___\n", 36 | "Alternatively, you can run the command from this notebook too. 
The relative path may vary" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 1, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | "Note: you may need to restart the kernel to use updated packages.\n" 49 | ] 50 | } 51 | ], 52 | "source": [ 53 | "%pip install -r ../../requirements.txt --quiet" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 4, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "name": "stdout", 63 | "output_type": "stream", 64 | "text": [ 65 | "Note: you may need to restart the kernel to use updated packages.\n", 66 | "Note: you may need to restart the kernel to use updated packages.\n", 67 | "Note: you may need to restart the kernel to use updated packages.\n", 68 | "Note: you may need to restart the kernel to use updated packages.\n", 69 | "Note: you may need to restart the kernel to use updated packages.\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "%pip install --upgrade pip --quiet\n", 75 | "%pip install langchain==0.2.11 --quiet\n", 76 | "%pip install langchain-community==0.2.10 --quiet\n", 77 | "%pip install scikit-learn==1.4.2 --quiet\n", 78 | "%pip install rank_bm25==0.2.2 --quiet" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "## TF-IDF" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "TF-IDF is a statistical measure used to evaluate the importance of a word in a document relative to a collection of documents (corpus). 
It assigns higher weights to words that appear frequently in a document but infrequently across the corpus" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 3, 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "name": "stdout", 109 | "output_type": "stream", 110 | "text": [ 111 | "Australia won the sixth time having last won in 2015\n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "# Import TFIDFRetriever class from retrievers library\n", 117 | "from langchain_community.retrievers import TFIDFRetriever\n", 118 | "\n", 119 | "# Create instance of the TFIDFRetriever with texts\n", 120 | "retriever = TFIDFRetriever.from_texts(\n", 121 | "[\"Australia won the Cricket World Cup 2023\",\n", 122 | " \"India and Australia played in the finals\",\n", 123 | " \"Australia won the sixth time having last won in 2015\"]\n", 124 | ")\n", 125 | "\n", 126 | "# Use the retriever using the invoke method\n", 127 | "result=retriever.invoke(\"won\")\n", 128 | "\n", 129 | "# Print the results\n", 130 | "print(result[0].page_content)\n" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "## BM25 " 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "BM25 is an advanced probabilistic model used to rank documents based on the query terms appearing in each document. It is part of the family of probabilistic information retrieval models and is considered an advancement over the classic TF-IDF model. The improvement that BM25 brings is that it adjusts for the length of the documents so that longer documents do not unfairly get higher scores. 
" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 5, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "name": "stdout", 161 | "output_type": "stream", 162 | "text": [ 163 | "Australia won the Cricket World Cup 2023\n" 164 | ] 165 | } 166 | ], 167 | "source": [ 168 | "# Import BM25Retriever class from retrievers library\n", 169 | "from langchain_community.retrievers import BM25Retriever\n", 170 | "\n", 171 | "# Create instance of the TFIDFRetriever with texts\n", 172 | "retriever = BM25Retriever.from_texts(\n", 173 | "[\"Australia won the Cricket World Cup 2023\",\n", 174 | " \"India and Australia played in the finals\",\n", 175 | " \"Australia won the sixth time having last won in 2015\"]\n", 176 | ")\n", 177 | "\n", 178 | "# Use the retriever using the invoke method\n", 179 | "result=retriever.invoke(\"Who won the 2023 Cricket World Cup?\")\n", 180 | "\n", 181 | "# Print the results\n", 182 | "print(result[0].page_content)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "---" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | " \n", 197 | "\n", 198 | "Hi! I'm Abhinav! I am an entrepreneur and Vice President of Artificial Intelligence at Yarnit. I have spent over 15 years consulting and leadership roles in data science, machine learning and AI. My current focus is in the applied Generative AI domain focussing on solving enterprise needs through contextual intelligence. I'm passionate about AI advancements constantly exploring emerging technologies to push the boundaries and create positive impacts in the world. 
Let’s build the future, together!\n", 199 | "\n", 200 | "[If you haven't already, please subscribe to the MEAP of A Simple Guide to Retrieval Augmented Generation here](https://mng.bz/8wdg)\n", 201 | "\n", 202 | "\n", 203 | " \"New\n", 204 | "\n", 205 | "\n", 206 | "#### If you'd like to chat, I'd be very happy to connect\n", 207 | "\n", 208 | "[![GitHub followers](https://img.shields.io/badge/Github-000000?style=for-the-badge&logo=github&logoColor=black&color=orange)](https://github.com/abhinav-kimothi)\n", 209 | "[![LinkedIn](https://img.shields.io/badge/LinkedIn-000000?style=for-the-badge&logo=linkedin&logoColor=orange&color=black)](https://www.linkedin.com/comm/mynetwork/discovery-see-all?usecase=PEOPLE_FOLLOWS&followMember=abhinav-kimothi)\n", 210 | "[![Medium](https://img.shields.io/badge/Medium-000000?style=for-the-badge&logo=medium&logoColor=black&color=orange)](https://medium.com/@abhinavkimothi)\n", 211 | "[![Insta](https://img.shields.io/badge/Instagram-000000?style=for-the-badge&logo=instagram&logoColor=orange&color=black)](https://www.instagram.com/akaiworks/)\n", 212 | "[![Mail](https://img.shields.io/badge/email-000000?style=for-the-badge&logo=gmail&logoColor=black&color=orange)](mailto:abhinav.kimothi.ds@gmail.com)\n", 213 | "[![X](https://img.shields.io/badge/Follow-000000?style=for-the-badge&logo=X&logoColor=orange&color=black)](https://twitter.com/abhinav_kimothi)\n", 214 | "[![Linktree](https://img.shields.io/badge/Linktree-000000?style=for-the-badge&logo=linktree&logoColor=black&color=orange)](https://linktr.ee/abhinavkimothi)\n", 215 | "[![Gumroad](https://img.shields.io/badge/Gumroad-000000?style=for-the-badge&logo=gumroad&logoColor=orange&color=black)](https://abhinavkimothi.gumroad.com/)\n", 216 | "\n", 217 | "---" 218 | ] 219 | } 220 | ], 221 | "metadata": { 222 | "kernelspec": { 223 | "display_name": ".envch4ex", 224 | "language": "python", 225 | "name": "python3" 226 | }, 227 | "language_info": { 228 | "codemirror_mode": { 229 | "name": 
"ipython", 230 | "version": 3 231 | }, 232 | "file_extension": ".py", 233 | "mimetype": "text/x-python", 234 | "name": "python", 235 | "nbconvert_exporter": "python", 236 | "pygments_lexer": "ipython3", 237 | "version": "3.13.2" 238 | } 239 | }, 240 | "nbformat": 4, 241 | "nbformat_minor": 2 242 | } 243 | -------------------------------------------------------------------------------- /Chapters/Chapter-05/evaluators.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from langchain.evaluation import load_evaluator 4 | from langchain.smith import RunEvalConfig 5 | from langchain_openai import ChatOpenAI 6 | 7 | try: 8 | from langchain.schema.language_model import BaseLanguageModel 9 | except ImportError: 10 | from langchain_core.language_models import BaseLanguageModel 11 | from langsmith.evaluation.evaluator import EvaluationResult, RunEvaluator 12 | from langsmith.schemas import Example, Run 13 | 14 | 15 | # TODO: Split this into an assertion-by-assertion evaluator 16 | # TODO: Combine with a document relevance evaluator (to report retriever performance) 17 | class FaithfulnessEvaluator(RunEvaluator): 18 | def __init__(self, llm: Optional[BaseLanguageModel] = None): 19 | self.evaluator = load_evaluator( 20 | "labeled_score_string", 21 | criteria={ 22 | "faithfulness": """ 23 | Score 1: The answer directly contradicts the information provided in the reference docs. 24 | Score 3: The answer contains a mix of correct information from the reference docs and incorrect or unverifiable information not found in the docs. 25 | Score 5: The answer is mostly aligned with the reference docs but includes extra information that, while not contradictory, is not verified by the docs. 26 | Score 7: The answer aligns well with the reference docs but includes minor, commonly accepted facts not found in the docs. 
27 | Score 10: The answer perfectly aligns with and is fully entailed by the reference docs, with no extra information.""" 28 | }, 29 | llm=llm, 30 | normalize_by=10, 31 | ) 32 | 33 | @staticmethod 34 | def _get_retrieved_docs(run: Run) -> str: 35 | # This assumes there is only one retriever in your chain. 36 | # To select more precisely, name your retrieval chain 37 | # using with_config(name="my_unique_name") and look up 38 | # by run.name 39 | runs = [run] 40 | while runs: 41 | run = runs.pop() 42 | if run.run_type == "retriever": 43 | return str(run.outputs["documents"]) 44 | if run.child_runs: 45 | runs.extend(run.child_runs[::-1]) 46 | return "" 47 | 48 | def evaluate_run( 49 | self, run: Run, example: Optional[Example] = None 50 | ) -> EvaluationResult: 51 | try: 52 | docs_string = self._get_retrieved_docs(run) 53 | docs_string = f"Reference docs:\n\n{docs_string}\n\n\n" 54 | print(f"\n{docs_string[10]}\n") 55 | input_query = run.inputs["Question"] 56 | print(f"\nInput Query={input_query}\n") 57 | if run.outputs is not None and len(run.outputs) == 1: 58 | prediction = next(iter(run.outputs.values())) 59 | print(f"\nPrediction={prediction}\n") 60 | else: 61 | prediction = run.outputs["output"] 62 | print(f"\nPrediction={prediction}\n") 63 | result = self.evaluator.evaluate_strings( 64 | input=input_query, 65 | prediction=prediction, 66 | reference=docs_string, 67 | ) 68 | return EvaluationResult( 69 | **{"key": "faithfulness", "comment": result.get("reasoning"), **result} 70 | ) 71 | except Exception as e: 72 | return EvaluationResult(key="faithfulness", score=None, comment=repr(e)) 73 | 74 | 75 | _ACCURACY_CRITERION = { 76 | "accuracy": """ 77 | Score 1: The answer is incorrect and unrelated to the question or reference document. 78 | Score 3: The answer shows slight relevance to the question or reference document but is largely incorrect. 79 | Score 5: The answer is partially correct but has significant errors or omissions. 
80 | Score 7: The answer is mostly correct with minor errors or omissions, and aligns with the reference document. 81 | Score 10: The answer is correct, complete, and perfectly aligns with the reference document. 82 | 83 | If the reference answer contains multiple alternatives, the predicted answer must only match one of the alternatives to be considered correct. 84 | If the predicted answer contains additional helpful and accurate information that is not present in the reference answer, it should still be considered correct. 85 | """ # noqa 86 | } 87 | 88 | 89 | def get_eval_config() -> RunEvalConfig: 90 | """Build the RunEvalConfig used to score runs: a labeled accuracy score, an embedding-distance metric, and the custom FaithfulnessEvaluator.""" 91 | eval_llm = ChatOpenAI( 92 | model="gpt-4o-mini", 93 | temperature=0.0, 94 | seed=42, 95 | max_retries=1, 96 | request_timeout=60, 97 | ) 98 | # LLM for the faithfulness evaluator; currently identical settings to eval_llm — swap in a longer-context model here if the retrieved docs grow large 99 | faithfulness_eval_llm = ChatOpenAI( 100 | model="gpt-4o-mini", 101 | temperature=0.0, 102 | seed=42, 103 | max_retries=1, 104 | request_timeout=60, 105 | ) 106 | 107 | return RunEvalConfig( 108 | evaluators=[ 109 | RunEvalConfig.LabeledScoreString( 110 | criteria=_ACCURACY_CRITERION, llm=eval_llm, normalize_by=10.0 111 | ), 112 | RunEvalConfig.EmbeddingDistance(), 113 | ], 114 | custom_evaluators=[FaithfulnessEvaluator(llm=faithfulness_eval_llm)], 115 | ) 116 | -------------------------------------------------------------------------------- /Chapters/Chapter-05/rag_evaluations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | " \"New\n", 9 | "\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Chapter 05 - RAG Evaluation: Accuracy, Relevance, Faithfulness" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "### Welcome to chapter 5 of A Simple Introduction to Retrieval Augmented 
Generation.\n", 24 | "\n", 25 | "In this chapter, we will assess the quality of the RAG pipeline we have built in Chapter 3 & 4. We will re-use the [knowledge base](../../Assets/Data/) we created with the Wikipedia article. We will reuse the Retrieval Augmentation and Generation functions we built in Chapter 4." 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## Installing Dependencies" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "All the necessary libraries for running this notebook along with their versions can be found in __requirements.txt__ file in the root directory of this repository\n", 40 | "\n", 41 | "You should go to the root directory and run the following command to install the libraries\n", 42 | "\n", 43 | "```\n", 44 | "pip install -r requirements.txt\n", 45 | "```\n", 46 | "\n", 47 | "This is the recommended method of installing the dependencies\n", 48 | "\n", 49 | "___\n", 50 | "Alternatively, you can run the command from this notebook too. The relative path may vary" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 1, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "\n", 63 | "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n", 64 | "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", 65 | "Note: you may need to restart the kernel to use updated packages.\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "%pip install -r ../../requirements.txt --quiet" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "## 1. 
Re-Load the RAG Pipeline" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "In chapter 4, we created the generation pipeline. We will bring that here to use it for evaluations.\n", 85 | "\n", 86 | "In Chapter 3, we were working on indexing the Wikipedia page for the 2023 cricket world cup. If you recall we had used embeddings from OpenAI to encode the text and used FAISS as the vector index to store the embeddings. We also stored the FAISS index in a local directory. We will use this in the RAG pipeline." 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "Note: You will need an __OpenAI API Key__ which can be obtained from [OpenAI](https://platform.openai.com/api-keys) to reuse the embeddings." 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "To initialize the __OpenAI client__, we need to pass the api key. There are many ways of doing it. \n", 101 | "\n", 102 | "#### [Option 1] Creating a .env file for storing the API key and using it # Recommended\n", 103 | "\n", 104 | "Install the __dotenv__ library\n", 105 | "\n", 106 | "_The dotenv library is a popular tool used in various programming languages, including Python and Node.js, to manage environment variables in development and deployment environments. It allows developers to load environment variables from a .env file into their application's environment._\n", 107 | "\n", 108 | "- Create a file named .env in the root directory of their project.\n", 109 | "- Inside the .env file, then define environment variables in the format VARIABLE_NAME=value. 
\n", 110 | "\n", 111 | "e.g.\n", 112 | "\n", 113 | "OPENAI_API_KEY=YOUR API KEY" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 2, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "Success: .env file found with some environment variables\n" 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "from dotenv import load_dotenv\n", 131 | "import os\n", 132 | "\n", 133 | "if load_dotenv():\n", 134 | " print(\"Success: .env file found with some environment variables\")\n", 135 | "else:\n", 136 | " print(\"Caution: No environment variables found. Please create .env file in the root directory or add environment variables in the .env file\")" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "#### [Option 2] Alternatively, you can set the API key in code. \n", 144 | "However, this is not recommended since it can leave your key exposed for potential misuse. Uncomment the cell below to use this method." 
145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 3, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "#import os\n", 154 | "# os.environ[\"OPENAI_API_KEY\"] = \"sk-proj-******\" #Imp : Replace with an OpenAI API Key" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "We can also test if the key is valid or not" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 3, 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "name": "stdout", 171 | "output_type": "stream", 172 | "text": [ 173 | "OPENAI_API_KEY is set and is valid\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "api_key=os.environ[\"OPENAI_API_KEY\"]\n", 179 | "\n", 180 | "import openai\n", 181 | "from openai import OpenAI\n", 182 | "\n", 183 | "client = OpenAI()\n", 184 | "\n", 185 | "\n", 186 | "if api_key:\n", 187 | " try:\n", 188 | " client.models.list()\n", 189 | " print(\"OPENAI_API_KEY is set and is valid\")\n", 190 | " except openai.APIError as e:\n", 191 | " print(f\"OpenAI API returned an API Error: {e}\")\n", 192 | " pass\n", 193 | " except openai.APIConnectionError as e:\n", 194 | " print(f\"Failed to connect to OpenAI API: {e}\")\n", 195 | " pass\n", 196 | " except openai.RateLimitError as e:\n", 197 | " print(f\"OpenAI API request exceeded rate limit: {e}\")\n", 198 | " pass\n", 199 | "\n", 200 | "else:\n", 201 | " print(\"Please set you OpenAI API key as an environment variable OPENAI_API_KEY\")\n", 202 | "\n" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "The RAG pipeline takes three inputs - \n", 210 | "1. User Query\n", 211 | "2. Location of the Vector Index (Knowledge base)\n", 212 | "3. 
Index Name\n", 213 | "\n", 214 | "And generate an answer along with the retrieved documents\n" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "#### RAG function" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 4, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "import re\n", 231 | "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n", 232 | "from langchain_community.vectorstores import FAISS\n", 233 | "\n", 234 | "# Function to clean text\n", 235 | "def clean_text(text):\n", 236 | " # Replace non-breaking space with regular space\n", 237 | " text = text.replace('\\xa0', ' ')\n", 238 | " \n", 239 | " # Remove any HTML tags (if any)\n", 240 | " text = re.sub(r'<[^>]+>', '', text) # Removes HTML tags\n", 241 | " \n", 242 | " # Remove references in brackets (e.g., [7], [39])\n", 243 | " text = re.sub(r'\\[.*?\\]', '', text) # Removes references inside square brackets\n", 244 | " \n", 245 | " # Remove extra spaces and newlines\n", 246 | " text = ' '.join(text.split()) # This will remove extra spaces and newline characters\n", 247 | " \n", 248 | " return text\n", 249 | "\n", 250 | "def rag_function(query, db_path, index_name):\n", 251 | " embeddings=OpenAIEmbeddings(model=\"text-embedding-3-small\")\n", 252 | "\n", 253 | " db=FAISS.load_local(folder_path=db_path, index_name=index_name, embeddings=embeddings, allow_dangerous_deserialization=True)\n", 254 | "\n", 255 | " retrieved_docs = db.similarity_search(query, k=2)\n", 256 | "\n", 257 | " retrieved_context=[clean_text(retrieved_docs[0].page_content + retrieved_docs[1].page_content)]\n", 258 | "\n", 259 | "\n", 260 | " augmented_prompt=f\"\"\"\n", 261 | "\n", 262 | " Given the context below answer the question.\n", 263 | "\n", 264 | " Question: {query} \n", 265 | "\n", 266 | " Context : {retrieved_context}\n", 267 | "\n", 268 | " Remember to answer only based on the context provided and not from any other 
source. \n", 269 | "\n", 270 | " If the question cannot be answered based on the provided context, say I don’t know.\n", 271 | "\n", 272 | " \"\"\"\n", 273 | "\n", 274 | " llm = ChatOpenAI(\n", 275 | " model=\"gpt-4o-mini\",\n", 276 | " temperature=0,\n", 277 | " max_tokens=None,\n", 278 | " timeout=None,\n", 279 | " max_retries=2\n", 280 | " )\n", 281 | "\n", 282 | " messages=[(\"human\",augmented_prompt)]\n", 283 | "\n", 284 | " ai_msg = llm.invoke(messages)\n", 285 | "\n", 286 | " response=ai_msg.content\n", 287 | "\n", 288 | " return retrieved_context, response\n", 289 | "\n" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "Let's try sending our question to this function." 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 5, 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "data": { 306 | "text/plain": [ 307 | "(['The tournament was contested by ten national teams, maintaining the same format used in 2019 . After six weeks of round-robin matches, India , South Africa , Australia , and New Zealand finished as the top four and qualified for the knockout stage. In the knockout stage, India and Australia beat New Zealand and South Africa, respectively, to advance to the final, played on 19 November at the Narendra Modi Stadium in Ahmedabad . Australia won the final by six wickets, winning their sixth Cricket World Cup title.The host India was the first team to qualify for the semi-finals after their 302-run win against Sri Lanka , their seventh successive win in the World Cup. 
India secured the top place amongst the semi-finalists after they beat South Africa by 243 runs on 5 November at Eden Gardens in Kolkata .'],\n", 308 | " 'Australia won the world cup.')" 309 | ] 310 | }, 311 | "execution_count": 5, 312 | "metadata": {}, 313 | "output_type": "execute_result" 314 | } 315 | ], 316 | "source": [ 317 | "rag_function(query=\"Who won the world cup?\", db_path=\"../../Assets/Data\", index_name=\"CWC_index\")" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "Let's ask another one." 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 6, 330 | "metadata": {}, 331 | "outputs": [ 332 | { 333 | "data": { 334 | "text/plain": [ 335 | "(['Virat Kohli was named the player of the tournament and also scored the most runs, while Mohammed Shami was the leading wicket-taker. A total of 1,250,307 spectators attended the matches, the highest number in any Cricket World Cup to date. The tournament final set viewership records in India, drawing 518 million viewers, with a peak of 57 million streaming viewers.The ICC announced its team of the tournament on 21 November 2023, with Virat Kohli being named as player of the tournament , and Rohit Sharma as captain of the team.'],\n", 336 | " 'Virat Kohli was named the player of the tournament and scored the most runs.')" 337 | ] 338 | }, 339 | "execution_count": 6, 340 | "metadata": {}, 341 | "output_type": "execute_result" 342 | } 343 | ], 344 | "source": [ 345 | "rag_function(query=\"What was Virat Kohli's achievement in the Cup?\",db_path=\"../../Assets/Data\", index_name=\"CWC_index\")" 346 | ] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "metadata": {}, 351 | "source": [ 352 | "We can also try asking a question which is out of the scope of our knowledge base" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 7, 358 | "metadata": {}, 359 | "outputs": [ 360 | { 361 | "data": { 362 | "text/plain": [ 
363 | "(['(RLQ=window.RLQ||).push(function(){mw.config.set({\"wgHostname\":\"mw-web.codfw.main-85db9df4c9-86vj4\",\"wgBackendResponseTime\":174,\"wgPageParseReport\":{\"limitreport\":{\"cputime\":\"2.102\",\"walltime\":\"2.387\",\"ppvisitednodes\":{\"value\":29880,\"limit\":1000000},\"postexpandincludesize\":{\"value\":547658,\"limit\":2097152},\"templateargumentsize\":{\"value\":113569,\"limit\":2097152},\"expansiondepth\":{\"value\":13,\"limit\":100},\"expensivefunctioncount\":{\"value\":22,\"limit\":500},\"unstrip-depth\":{\"value\":1,\"limit\":20},\"unstrip-size\":{\"value\":312186,\"limit\":5000000},\"entityaccesscount\":{\"value\":1,\"limit\":400},\"timingprofile\":[\"100.00% 1812.691 1 -total\",\" 22.76% 412.523 1 Template:Reflist\",\" 14.91% 270.321 37 Template:Cite_web\",\" 11.46% 207.704 58 Template:Single-innings_cricket_match\",\" 11.12% 201.536 1 Template:2023_CWC_and_2025_ICC_CT_sidebar\",\" 10.94% 198.332 1 Template:Sidebar_with_collapsible_lists\",\" 7.79% 141.132 96 Template:Cr\",\" 7.15% 129Background Host selection'],\n", 364 | " 'I don’t know.')" 365 | ] 366 | }, 367 | "execution_count": 7, 368 | "metadata": {}, 369 | "output_type": "execute_result" 370 | } 371 | ], 372 | "source": [ 373 | "rag_function(query=\"What RAG?\",db_path=\"../../Assets/Data\", index_name=\"CWC_index\")" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "For some of the questions, the response may be \"I don't know\". That is when the LLM can't find an answer in the retrieved context. In our augmentation step, we had asked the LLM to do so. But how good is this system? We need to be able to evaluate it." 381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "metadata": {}, 386 | "source": [ 387 | "## 2. 
RAGAs Framework" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "[Ragas](https://docs.ragas.io/en/stable/) is a framework that helps you evaluate your Retrieval Augmented Generation (RAG) pipelines. It has been developed by the good folks at [exploding gradients](https://github.com/explodinggradients)." 395 | ] 396 | }, 397 | { 398 | "cell_type": "markdown", 399 | "metadata": {}, 400 | "source": [ 401 | "We will look at this evaluation in 2 parts. \n", 402 | "\n", 403 | "1. Creation of synthetic test data for evaluation.\n", 404 | "2. Calculation of evaluation metrics." 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "metadata": {}, 410 | "source": [ 411 | "### 2.1 Creation of Synthetic Data" 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "metadata": {}, 417 | "source": [ 418 | "Synthetic Data Generation uses LLMs to generate diverse questions and answers from the documents in the knowledge base. LLMs can be prompted to create questions like simple questions, multi-context questions, conditional questions, reasoning questions etc. 
using the documents from the knowledge base as context.\n", 419 | "\n", 420 | "" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 8, 426 | "metadata": {}, 427 | "outputs": [ 428 | { 429 | "name": "stderr", 430 | "output_type": "stream", 431 | "text": [ 432 | "USER_AGENT environment variable not set, consider setting it to identify your requests.\n", 433 | "Fetching pages: 100%|##########| 1/1 [00:00<00:00, 3.25it/s]\n" 434 | ] 435 | } 436 | ], 437 | "source": [ 438 | "from langchain_community.document_loaders import AsyncHtmlLoader\n", 439 | "\n", 440 | "#This is the url of the wikipedia page on the 2023 Cricket World Cup\n", 441 | "url=\"https://en.wikipedia.org/wiki/2023_Cricket_World_Cup\"\n", 442 | "\n", 443 | "#Instantiating the AsyncHtmlLoader\n", 444 | "loader = AsyncHtmlLoader (url)\n", 445 | "\n", 446 | "#Loading the extracted information\n", 447 | "html_data = loader.load()\n", 448 | "\n", 449 | "from langchain_community.document_transformers import Html2TextTransformer\n", 450 | "\n", 451 | "#Instantiate the Html2TextTransformer function\n", 452 | "html2text = Html2TextTransformer()\n", 453 | "\n", 454 | "\n", 455 | "#Call transform_documents\n", 456 | "html_data_transformed = html2text.transform_documents(html_data)" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": 13, 462 | "metadata": {}, 463 | "outputs": [], 464 | "source": [ 465 | "from ragas.llms import LangchainLLMWrapper\n", 466 | "from ragas.embeddings import LangchainEmbeddingsWrapper\n", 467 | "\n", 468 | "\n", 469 | "generator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o-mini\"))\n", 470 | "generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings(model=\"text-embedding-3-small\"))" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 15, 476 | "metadata": {}, 477 | "outputs": [ 478 | { 479 | "name": "stderr", 480 | "output_type": "stream", 481 | "text": [ 482 | "Generating personas: 
100%|██████████| 1/1 [00:01<00:00, 1.06s/it] \n", 483 | "Generating Scenarios: 100%|██████████| 2/2 [00:07<00:00, 3.67s/it]\n", 484 | "Generating Samples: 100%|██████████| 10/10 [00:02<00:00, 3.40it/s]\n" 485 | ] 486 | } 487 | ], 488 | "source": [ 489 | "from ragas.testset import TestsetGenerator\n", 490 | "\n", 491 | "generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)\n", 492 | "dataset = generator.generate_with_langchain_docs(html_data_transformed, testset_size=10)" 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": 16, 498 | "metadata": {}, 499 | "outputs": [], 500 | "source": [ 501 | "sample_queries = dataset.to_pandas()['user_input'].to_list()" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": 17, 507 | "metadata": {}, 508 | "outputs": [], 509 | "source": [ 510 | "expected_responses=dataset.to_pandas()['reference'].to_list()" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": 19, 516 | "metadata": {}, 517 | "outputs": [], 518 | "source": [ 519 | "dataset_to_eval=[]\n", 520 | "\n", 521 | "for query, reference in zip(sample_queries,expected_responses):\n", 522 | " rag_call_response=rag_function(query=query, db_path=\"../../Assets/Data/\", index_name=\"CWC_index\")\n", 523 | " relevant_docs=rag_call_response[0]\n", 524 | " response=rag_call_response[1]\n", 525 | " dataset_to_eval.append(\n", 526 | " {\n", 527 | " \"user_input\":query,\n", 528 | " \"retrieved_contexts\":relevant_docs,\n", 529 | " \"response\":response,\n", 530 | " \"reference\":reference\n", 531 | " }\n", 532 | " )\n" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": 21, 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [ 541 | "from ragas import EvaluationDataset\n", 542 | "evaluation_dataset = EvaluationDataset.from_list(dataset_to_eval)\n" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": 22, 548 | "metadata": {}, 549 | 
You can interpret the results above. It looks like we are performing well on __faithfulness__, but the other metrics are low. How do we improve them? We will look at advanced pre-retrieval, retrieval, and post-retrieval strategies in the next chapter.
I'm Abhinav! I am an entrepreneur and Vice President of Artificial Intelligence at Yarnit. I have spent over 15 years in consulting and leadership roles in data science, machine learning, and AI. My current focus is the applied Generative AI domain, focusing on solving enterprise needs through contextual intelligence. I'm passionate about AI advancements, constantly exploring emerging technologies to push the boundaries and create positive impacts in the world. Let’s build the future, together!
"[![Linktree](https://img.shields.io/badge/Linktree-000000?style=for-the-badge&logo=linktree&logoColor=black&color=orange)](https://linktr.ee/abhinavkimothi)\n", 633 | "[![Gumroad](https://img.shields.io/badge/Gumroad-000000?style=for-the-badge&logo=gumroad&logoColor=orange&color=black)](https://abhinavkimothi.gumroad.com/)\n", 634 | "\n", 635 | "---" 636 | ] 637 | }, 638 | { 639 | "cell_type": "markdown", 640 | "metadata": {}, 641 | "source": [] 642 | } 643 | ], 644 | "metadata": { 645 | "kernelspec": { 646 | "display_name": ".ch5", 647 | "language": "python", 648 | "name": "python3" 649 | }, 650 | "language_info": { 651 | "codemirror_mode": { 652 | "name": "ipython", 653 | "version": 3 654 | }, 655 | "file_extension": ".py", 656 | "mimetype": "text/x-python", 657 | "name": "python", 658 | "nbconvert_exporter": "python", 659 | "pygments_lexer": "ipython3", 660 | "version": "3.13.2" 661 | } 662 | }, 663 | "nbformat": 4, 664 | "nbformat_minor": 2 665 | } 666 | -------------------------------------------------------------------------------- /Chapters/Chapter-06/advanced_rag.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | " \"New\n", 9 | "" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Chapter 06 - Progression of RAG Systems: Naïve to Advanced, and Modular RAG" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "We have familiarized ourselves with the utility of RAG along with the development and evaluation of a basic RAG system. The basic, or the Naïve RAG approach that we have seen so far is, generally, inadequate when it comes to production-grade systems." 
24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "\n", 31 | " \"Naive\n", 32 | "\n" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "In this chapter we will focus on more advanced concepts in RAG that make RAG possible in production. Let's begin by installing dependencies." 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "## Installing Dependencies" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "All the necessary libraries for running this notebook along with their versions can be found in __requirements.txt__ file in the root directory of this repository\n", 54 | "\n", 55 | "You should go to the root directory and run the following command to install the libraries\n", 56 | "\n", 57 | "```\n", 58 | "pip install -r requirements.txt\n", 59 | "```\n", 60 | "\n", 61 | "This is the recommended method of installing the dependencies\n", 62 | "\n", 63 | "___\n", 64 | "Alternatively, you can run the command from this notebook too. 
The relative path may vary" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 2, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "\n", 77 | "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n", 78 | "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", 79 | "Note: you may need to restart the kernel to use updated packages.\n" 80 | ] 81 | } 82 | ], 83 | "source": [ 84 | "%pip install -r ../../requirements.txt --quiet" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "## Advanced RAG Techniques" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "Advanced techniques in RAG have continued to emerge since the earliest experiments with Naïve RAG. There are three stages in which we can discuss these techniques – \n", 99 | "1.\tPre-retrieval Stage: Like the name suggests, there are certain interventions that can be employed before the retriever comes into action. This broadly covers two aspects \n", 100 | " - Index Optimization – The way documents are stored in the knowledge base\n", 101 | " - Query Optimization – Optimizing the user query so it aligns better to the retrieval and generation tasks\n", 102 | "2.\tRetrieval Stage: Certain strategies can improve the recall and precision of the retrieval process. 
- Inside the .env file, define environment variables in the format VARIABLE_NAME=value.
\n", 136 | "\n", 137 | "e.g.\n", 138 | "\n", 139 | "OPENAI_API_KEY=YOUR API KEY" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 3, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | "Success: .env file found with some environment variables\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "from dotenv import load_dotenv\n", 157 | "import os\n", 158 | "\n", 159 | "if load_dotenv():\n", 160 | " print(\"Success: .env file found with some environment variables\")\n", 161 | "else:\n", 162 | " print(\"Caution: No environment variables found. Please create .env file in the root directory or add environment variables in the .env file\")" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 4, 168 | "metadata": {}, 169 | "outputs": [ 170 | { 171 | "name": "stdout", 172 | "output_type": "stream", 173 | "text": [ 174 | "OPENAI_API_KEY is set and is valid\n" 175 | ] 176 | } 177 | ], 178 | "source": [ 179 | "api_key=os.environ[\"OPENAI_API_KEY\"]\n", 180 | "\n", 181 | "from openai import OpenAI\n", 182 | "\n", 183 | "client = OpenAI()\n", 184 | "\n", 185 | "\n", 186 | "if api_key:\n", 187 | " try:\n", 188 | " client.models.list()\n", 189 | " print(\"OPENAI_API_KEY is set and is valid\")\n", 190 | " except openai.APIError as e:\n", 191 | " print(f\"OpenAI API returned an API Error: {e}\")\n", 192 | " pass\n", 193 | " except openai.APIConnectionError as e:\n", 194 | " print(f\"Failed to connect to OpenAI API: {e}\")\n", 195 | " pass\n", 196 | " except openai.RateLimitError as e:\n", 197 | " print(f\"OpenAI API request exceeded rate limit: {e}\")\n", 198 | " pass\n", 199 | "\n", 200 | "else:\n", 201 | " print(\"Please set you OpenAI API key as an environment variable OPENAI_API_KEY\")\n", 202 | "\n" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "## 1. 
The primary objective of employing pre-retrieval techniques is to facilitate better retrieval. Retrieval failures can happen for two reasons.
"source": [ 276 | "url=\"https://en.wikipedia.org/wiki/2023_Cricket_World_Cup\"" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 8, 282 | "metadata": {}, 283 | "outputs": [ 284 | { 285 | "name": "stderr", 286 | "output_type": "stream", 287 | "text": [ 288 | "Fetching pages: 0%| | 0/1 [00:00 \n", 1211 | "**Part of a series on the** \n", 1212 | "--- \n", 1213 | "2023 Cricket World Cup / \n", 1214 | "2025 ICC Champions Trophy \n", 1215 | "CWC: Category • Commons \n", 1216 | "CT: Category • Commons \n", 1217 | "2023 Cricket World Cup \n", 1218 | "Background\n", 1219 | "\n", 1220 | " * Host selection\n", 1221 | " * COVID-19 pandemic\n", 1222 | " * Format\n", 1223 | " * Pakistan's participation\n", 1224 | " * Prize money\n", 1225 | " * Marketing\n", 1226 | "\n", 1227 | " \n", 1228 | "Stages\n", 1229 | "\n", 1230 | " * Warm-up matches\n", 1231 | "\n", 1232 | " * Group stage\n", 1233 | "\n", 1234 | " * Knockout stage \n", 1235 | " * Semi-finals\n", 1236 | " * Final\n", 1237 | "\n", 1238 | " \n", 1239 | "General Information\n", 1240 | "\n", 1241 | " * Officials\n", 1242 | " * Squads\n", 1243 | " * Statistics\n", 1244 | " * Venues\n", 1245 | "\n", 1246 | " \n", 1247 | "CWC Qualification \n", 1248 | "Overview \n", 1249 | "Super League\n", 1250 | "\n", 1251 | " * 2020–2023 Super League\n", 1252 | "\n", 1253 | " * 2020 \n", 1254 | " * AUS v ENG\n", 1255 | " * NED v ZIM\n", 1256 | " * IRE v ENG\n", 1257 | "\n", 1258 | " * 2020–21 \n", 1259 | " * ENG v IND\n", 1260 | " * IND v AUS\n", 1261 | " * BAN v NZ\n", 1262 | " * PAK v SA\n", 1263 | " * ZIM v PAK\n", 1264 | " * ZIM v SL\n", 1265 | " * ENG v SA\n", 1266 | " * WIN v BAN\n", 1267 | " * IRE v AFG in UAE\n", 1268 | " * SL v WIN\n", 1269 | "\n", 1270 | " * 2021 \n", 1271 | " * SL v BAN\n", 1272 | " * IRE v NED\n", 1273 | " * AUS v WIN\n", 1274 | " * BAN v ZIM\n", 1275 | " * SL v ENG\n", 1276 | " * PAK v ENG\n", 1277 | " * SA v IRE\n", 1278 | " * IND v SL\n", 1279 | " * AFG v SL\n", 1280 | " * ZIM v 
IRE\n", 1281 | "\n", 1282 | " * 2021–22 \n", 1283 | " * AFG v PAK\n", 1284 | " * SA v SL\n", 1285 | " * ZIM v SL\n", 1286 | " * WIN v IND\n", 1287 | " * AFG v BAN\n", 1288 | " * AUS v PAK\n", 1289 | " * BAN v SA\n", 1290 | " * NED v SA\n", 1291 | " * NED v NZ\n", 1292 | " * IRE v WIN\n", 1293 | " * NED v AFG in Qatar\n", 1294 | " * AFG v IND\n", 1295 | "\n", 1296 | " * 2022 \n", 1297 | " * NZ v IRE\n", 1298 | " * WIN v NED\n", 1299 | " * ENG v NED\n", 1300 | " * WIN v PAK\n", 1301 | " * PAK v NED\n", 1302 | " * AFG v ZIM\n", 1303 | " * NZ v WIN\n", 1304 | " * IND v ZIM\n", 1305 | " * ZIM v AUS\n", 1306 | "\n", 1307 | " * 2022–23 \n", 1308 | " * NZ v AUS\n", 1309 | " * SA v IND\n", 1310 | " * AFG v SL\n", 1311 | " * NZ v PAK\n", 1312 | " * SA v AUS\n", 1313 | " * ENG v SA\n", 1314 | " * ENG v BAN\n", 1315 | " * SL v NZ\n", 1316 | " * NED v ZIM\n", 1317 | "\n", 1318 | " * 2023 \n", 1319 | " * BAN v IRE\n", 1320 | "\n", 1321 | " \n", 1322 | "League 2\n", 1323 | "\n", 1324 | " * 2019–2023 League 2\n", 1325 | "\n", 1326 | " * 2019 \n", 1327 | " * 1\n", 1328 | " * 2\n", 1329 | " * 3\n", 1330 | "\n", 1331 | " * 2020 \n", 1332 | " * 4\n", 1333 | " * 5\n", 1334 | "\n", 1335 | " * 2021 \n", 1336 | " * 6\n", 1337 | " * 7\n", 1338 | " * 8\n", 1339 | "\n", 1340 | " * 2022 \n", 1341 | " * N/A\n", 1342 | " * 9\n", 1343 | " * 10\n", 1344 | " * 11\n", 1345 | " * 12\n", 1346 | " * 13\n", 1347 | " * 14\n", 1348 | " * 15\n", 1349 | " * 16\n", 1350 | " * 17\n", 1351 | " * 18\n", 1352 | "\n", 1353 | " * 2023 \n", 1354 | " * 19\n", 1355 | " * N/A\n", 1356 | " * 20\n", 1357 | " * 21\n", 1358 | "\n", 1359 | " \n", 1360 | "Challenge League\n", 1361 | "\n", 1362 | " * 2019–2022 Challenge League\n", 1363 | "\n", 1364 | " * A \n", 1365 | " * 2019\n", 1366 | " * 2021 (2022)\n", 1367 | " * 2020 (2022)\n", 1368 | "\n", 1369 | " * B \n", 1370 | " * 2019\n", 1371 | " * 2020 (2022)\n", 1372 | " * 2021 (2022)\n", 1373 | "\n", 1374 | " \n", 1375 | "CWC Qualifier\n", 1376 | "\n", 1377 | " * 2023 
Qualifier Play-off\n", 1378 | " * 2023 Qualifier\n", 1379 | "\n", 1380 | " \n", 1381 | "2025 ICC Champions Trophy \n", 1382 | "Background\n", 1383 | "\n", 1384 | " * Host selection\n", 1385 | " * Format\n", 1386 | " * India's participation\n", 1387 | " * Prize money\n", 1388 | " * Marketing\n", 1389 | "\n", 1390 | " \n", 1391 | "Stages\n", 1392 | "\n", 1393 | " * Warm-up matches\n", 1394 | "\n", 1395 | " * Group stage \n", 1396 | " * Group A\n", 1397 | " * Group B\n", 1398 | "\n", 1399 | " * Knockout stage \n", 1400 | " * Semi-finals\n", 1401 | " * Final\n", 1402 | "\n", 1403 | " \n", 1404 | "General Information\n", 1405 | "\n", 1406 | " * Officials\n", 1407 | " * Squads\n", 1408 | " * Statistics\n", 1409 | " * Venues\n", 1410 | "\n", 1411 | " \n", 1412 | "← 2019 CWC 2027 → \n", 1413 | "← 2017 CT 2029 → \n", 1414 | " \n", 1415 | " * v\n", 1416 | " * t\n", 1417 | " * e\n", 1418 | "\n", 1419 | " \n", 1420 | " \n", 1421 | "The **2023 ICC Men's Cricket World Cup** was the 13th edition of the ICC Men's\n", 1422 | "Cricket World Cup, a quadrennial One Day International (ODI) cricket\n", 1423 | "tournament organized by the International Cricket Council (ICC). It was hosted\n", 1424 | "from 5 October to 19 November 2023 across ten venues in India. This was the\n", 1425 | "fourth World Cup held in India, but the first where India was the sole host.\n", 1426 | "\n", 1427 | "The tournament was contested by ten national teams, maintaining the same\n", 1428 | "format used in 2019. After six weeks of round-robin matches, India, South\n", 1429 | "Africa, Australia, and New Zealand finished as the top four and qualified for\n", 1430 | "the knockout stage. In the knockout stage, India and Australia beat New\n", 1431 | "Zealand and South Africa, respectively, to advance to the final, played on 19\n", 1432 | "November at the Narendra Modi Stadium in Ahmedabad. 
Australia won the final by\n", 1433 | "six wickets, winning their sixth Cricket World Cup title.\n", 1434 | "\n", 1435 | "Virat Kohli was named the player of the tournament and also scored the most\n", 1436 | "runs, while Mohammed Shami was the leading wicket-taker. A total of 1,250,307\n", 1437 | "spectators attended the matches, the highest number in any Cricket World Cup\n", 1438 | "to date.[1] The tournament final set viewership records in India, drawing 518\n", 1439 | "million viewers, with a peak of 57 million streaming viewers.\n", 1440 | "\n", 1441 | "## Background\n", 1442 | "\n", 1443 | "### Host selection\n", 1444 | "\n", 1445 | "On 11 December 2017, India was announced by the ICC as hosts of the 2023\n", 1446 | "Cricket World Cup; while India had served as a co-host during three previous\n", 1447 | "tournaments (most recently in 2011, which it co-hosted with Sri Lanka and\n", 1448 | "Bangladesh), it would mark the first Cricket World Cup to be hosted solely by\n", 1449 | "India.[2]\n", 1450 | "\n", 1451 | "### COVID-19 pandemic\n", 1452 | "\n", 1453 | "Further information: Impact of the COVID-19 pandemic on cricket\n", 1454 | "\n", 1455 | "Originally, the competition was to be played from 9 February to 26 March\n", 1456 | "2023.[3][4] In July 2020 it was announced that due to the disruption of the\n", 1457 | "qualification schedule by the COVID-19 pandemic, the start of the tournament\n", 1458 | "would be delayed to October.[5][6] The ICC released the tournament schedule on\n", 1459 | "27 June 2023.[7][8]\n", 1460 | "\n", 1461 | "### Format\n", 1462 | "\n", 1463 | "This was the first ICC World Cup in which penalties for slow over-rates were\n", 1464 | "given to bowling sides if they did not complete their 50 overs in the\n", 1465 | "stipulated time. 
On-field umpires could penalise the bowling team by not\n", 1466 | "allowing more than four fielders outside the 30-yard circle.[9]\n", 1467 | "\n", 1468 | "### Pakistan's participation\n", 1469 | "\n", 1470 | "The Pakistan Cricket Board (PCB) had threatened to boycott the tournament\n", 1471 | "after the Board of Control for Cricket in India (BCCI) refused to send a team\n", 1472 | "to the 2023 Asia Cup scheduled in Pakistan.[10][11] This issue was resolved in\n", 1473 | "June 2023 after the Asian Cricket Council announced that the tournament would\n", 1474 | "be hosted using a hybrid model proposed by the PCB, with nine of the 13\n", 1475 | "matches in the competition played in Sri Lanka.[12][13]\n", 1476 | "\n", 1477 | "### Prize money\n", 1478 | "\n", 1479 | "The ICC allocated a pool of US$10 million in prize money for the tournament,\n", 1480 | "with payouts remaining the same as the 2019 and 2015 tournaments. Australia,\n", 1481 | "the winning team, received US$4,000,000, the runner-up $2,000,000 and the\n", 1482 | "losing semi-finalists $1,600,000. Teams that did not progress past the league\n", 1483 | "stage received $100,000 and the winner of each league stage match received\n", 1484 | "$40,000.[14][15]\n", 1485 | "\n", 1486 | "### Marketing\n", 1487 | "\n", 1488 | "The ICC hosted a trophy tour for 100 days prior to the tournament beginning 27\n", 1489 | "June, with the Cricket World Cup Trophy being taken to various locations\n", 1490 | "around the world. The event began with the launching of the trophy into the\n", 1491 | "stratosphere by Sent Into Space and landing at Modi Stadium—becoming the first\n", 1492 | "sports trophy to have ever been sent into space.[16] The ICC officially\n", 1493 | "announced the mascots for the World Cup in August. 
The mascots were a male and\n", 1494 | "female duo named \"Tonk\" and \"Blaze\" from the fictional cricketing utopia\n", 1495 | "\"Crictoverse\".[17][18]\n", 1496 | "\n", 1497 | "Ahead of the tournament, it was reported that an opening ceremony would take\n", 1498 | "place on 4 October 2023 at the Narendra Modi Stadium in Ahmedabad, a day\n", 1499 | "before the opening match at the same venue.[19] The official theme song of the\n", 1500 | "2023 Cricket World Cup titled \"Dil Jashn Bole\" (transl. Heart say celebrate)\n", 1501 | "was released on 20 September. The song was composed by Pritam, and was sung by\n", 1502 | "Pritam, Nakash Aziz, Sreerama Chandra, Amit Mishra, Jonita Gandhi, Akasa Singh\n", 1503 | "and S. P. Charan.[20] However, the song was subject to backlash and bad\n", 1504 | "reviews.[21] The opening ceremony was cancelled and replaced by a closing\n", 1505 | "ceremony ahead of the final.[22] During this a drone show was held.[23][24]\n", 1506 | "\n", 1507 | "## Qualification\n", 1508 | "\n", 1509 | "Highlighted are the countries that participated in the 2023 Cricket World Cup.\n", 1510 | "\n", 1511 | "Qualified as host\n", 1512 | "\n", 1513 | "Qualified via the 2020–2023 Super League\n", 1514 | "\n", 1515 | "Qualified via the 2023 Qualifier\n", 1516 | "\n", 1517 | "Participated in the qualifier but failed to qualify\n", 1518 | "Metadata: {'source': 'https://en.wikipedia.org/wiki/2023_Cricket_World_Cup', 'category': 'cricket world cup', 'extracted_metadata': {'player_1': 'Virat Kohli', 'player_2': 'Mohammed Shami', 'player_3': '', 'player_4': '', 'player_5': '', 'team_1': 'Australia', 'team_2': 'India', 'team_3': 'New Zealand', 'team_4': 'South Africa', 'team_5': '', 'keyword_1': '2023 Cricket World Cup', 'keyword_2': 'One Day International', 'keyword_3': 'International Cricket Council', 'keyword_4': 'Knockout stage', 'keyword_5': 'Prize money'}}\n" 1519 | ] 1520 | } 1521 | ], 1522 | "source": [ 1523 | "import faiss\n", 1524 | "from 
langchain_community.vectorstores import FAISS\n", 1525 | "from langchain_community.docstore.in_memory import InMemoryDocstore\n", 1526 | "from langchain_openai import OpenAIEmbeddings\n", 1527 | "from langchain_core.documents import Document\n", 1528 | "from langchain_community.document_loaders import AsyncHtmlLoader\n", 1529 | "from langchain_community.document_transformers import Html2TextTransformer\n", 1530 | "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", 1531 | "from openai import OpenAI\n", 1532 | "from langchain_openai import ChatOpenAI\n", 1533 | "\n", 1534 | "# Initialize the OpenAI client\n", 1535 | "client = OpenAI()\n", 1536 | "\n", 1537 | "# Function to extract fixed metadata using GPT-4o-mini with JSON response\n", 1538 | "def extract_fixed_metadata_from_chunk(chunk_text):\n", 1539 | " prompt = f\"\"\"\n", 1540 | " Extract the following fixed metadata in JSON format from the given text:\n", 1541 | " {{\n", 1542 | " \"player_1\": \"\",\n", 1543 | " \"player_2\": \"\",\n", 1544 | " \"player_3\": \"\",\n", 1545 | " \"player_4\": \"\",\n", 1546 | " \"player_5\": \"\",\n", 1547 | " \"team_1\": \"\",\n", 1548 | " \"team_2\": \"\",\n", 1549 | " \"team_3\": \"\",\n", 1550 | " \"team_4\": \"\",\n", 1551 | " \"team_5\": \"\",\n", 1552 | " \"keyword_1\": \"\",\n", 1553 | " \"keyword_2\": \"\",\n", 1554 | " \"keyword_3\": \"\",\n", 1555 | " \"keyword_4\": \"\",\n", 1556 | " \"keyword_5\": \"\"\n", 1557 | " }}\n", 1558 | " Here's the text:\n", 1559 | " {chunk_text}\n", 1560 | " \"\"\"\n", 1561 | "\n", 1562 | " llm = ChatOpenAI(\n", 1563 | " model=\"gpt-4o-mini\",\n", 1564 | " temperature=0,\n", 1565 | " max_tokens=None,\n", 1566 | " timeout=None,\n", 1567 | " max_retries=2\n", 1568 | " )\n", 1569 | "\n", 1570 | " json_llm = llm.bind(response_format={\"type\": \"json_object\"})\n", 1571 | "\n", 1572 | "\n", 1573 | " #Craft the prompt message\n", 1574 | " messages=[(\"human\",prompt)]\n", 1575 | "\n", 1576 | "\n", 1577 | " # Invoke the 
LLM\n", 1578 | " ai_msg = json_llm.invoke(messages)\n", 1579 | " \n", 1580 | "\n", 1581 | " \n", 1582 | " # Extract the response in JSON format\n", 1583 | " metadata_response = ai_msg.content\n", 1584 | " print(metadata_response)\n", 1585 | " try:\n", 1586 | " # Convert the response into a dictionary\n", 1587 | " metadata = eval(metadata_response) # This ensures it is a valid dictionary\n", 1588 | " except Exception as e:\n", 1589 | " print(f\"Error parsing metadata: {e}\")\n", 1590 | " metadata = {\n", 1591 | " \"player_1\": \"\", \"player_2\": \"\", \"player_3\": \"\", \"player_4\": \"\", \"player_5\": \"\",\n", 1592 | " \"team_1\": \"\", \"team_2\": \"\", \"team_3\": \"\", \"team_4\": \"\", \"team_5\": \"\",\n", 1593 | " \"keyword_1\": \"\", \"keyword_2\": \"\", \"keyword_3\": \"\", \"keyword_4\": \"\", \"keyword_5\": \"\"\n", 1594 | " }\n", 1595 | " return metadata\n", 1596 | "\n", 1597 | "# Step 1: Load data from a URL (Wikipedia page)\n", 1598 | "url = \"https://en.wikipedia.org/wiki/2023_Cricket_World_Cup\"\n", 1599 | "loader = AsyncHtmlLoader(url)\n", 1600 | "data = loader.load()\n", 1601 | "\n", 1602 | "# Step 2: Transform the HTML content to plain text\n", 1603 | "html2text = Html2TextTransformer()\n", 1604 | "data_transformed = html2text.transform_documents(data)\n", 1605 | "\n", 1606 | "# Step 3: Split the text into smaller chunks using RecursiveCharacterTextSplitter\n", 1607 | "text_splitter = RecursiveCharacterTextSplitter(\n", 1608 | " chunk_size=10000, # Number of characters in each chunk\n", 1609 | " chunk_overlap=200 # Number of overlapping characters between chunks\n", 1610 | ")\n", 1611 | "chunks = text_splitter.split_text(data_transformed[0].page_content)\n", 1612 | "\n", 1613 | "# Step 4: Initialize OpenAI Embeddings model\n", 1614 | "embedding_model = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n", 1615 | "\n", 1616 | "# Step 5: Initialize FAISS index for L2 (Euclidean) distance\n", 1617 | "embedding_dim = 
len(embedding_model.embed_query(\"hello world\"))\n", 1618 | "index = faiss.IndexFlatL2(embedding_dim)\n", 1619 | "\n", 1620 | "# Step 6: Initialize the InMemoryDocstore to store documents and metadata in memory\n", 1621 | "docstore = InMemoryDocstore()\n", 1622 | "\n", 1623 | "# Step 7: Create FAISS vector store using the embedding function, FAISS index, and docstore\n", 1624 | "vector_store = FAISS(\n", 1625 | " embedding_function=embedding_model,\n", 1626 | " index=index,\n", 1627 | " docstore=docstore,\n", 1628 | " index_to_docstore_id={}\n", 1629 | ")\n", 1630 | "\n", 1631 | "# Step 8: Add chunks (documents) with extracted metadata and embeddings to FAISS vector store\n", 1632 | "documents = []\n", 1633 | "for i, chunk in enumerate(chunks):\n", 1634 | " # Extract fixed metadata using the LLM\n", 1635 | " extracted_metadata = extract_fixed_metadata_from_chunk(chunk)\n", 1636 | " \n", 1637 | " # Create a document object with both the chunk content and the extracted metadata\n", 1638 | " document = Document(\n", 1639 | " page_content=chunk, \n", 1640 | " metadata={\n", 1641 | " \"source\": url, \n", 1642 | " \"category\": \"cricket world cup\",\n", 1643 | " \"extracted_metadata\": extracted_metadata # Store the structured metadata\n", 1644 | " }\n", 1645 | " )\n", 1646 | " \n", 1647 | " # Append the document to the list\n", 1648 | " documents.append(document)\n", 1649 | "\n", 1650 | "# Create unique IDs for each chunk\n", 1651 | "ids = [f\"chunk_{i}\" for i in range(len(chunks))]\n", 1652 | "\n", 1653 | "# Add the documents and their embeddings to the FAISS vector store\n", 1654 | "vector_store.add_documents(documents=documents, ids=ids)\n", 1655 | "\n", 1656 | "# Step 9: Define a function to extract metadata from a query\n", 1657 | "def extract_fixed_metadata_from_query(query_text):\n", 1658 | " prompt = f\"\"\"\n", 1659 | " Extract the following fixed metadata in JSON format from the query:\n", 1660 | " {{\n", 1661 | " \"player_1\": \"\",\n", 1662 | " 
\"player_2\": \"\",\n", 1663 | " \"player_3\": \"\",\n", 1664 | " \"player_4\": \"\",\n", 1665 | " \"player_5\": \"\",\n", 1666 | " \"team_1\": \"\",\n", 1667 | " \"team_2\": \"\",\n", 1668 | " \"team_3\": \"\",\n", 1669 | " \"team_4\": \"\",\n", 1670 | " \"team_5\": \"\",\n", 1671 | " \"keyword_1\": \"\",\n", 1672 | " \"keyword_2\": \"\",\n", 1673 | " \"keyword_3\": \"\",\n", 1674 | " \"keyword_4\": \"\",\n", 1675 | " \"keyword_5\": \"\"\n", 1676 | " }}\n", 1677 | " Here's the query:\n", 1678 | " {query_text}\n", 1679 | " \"\"\"\n", 1680 | " \n", 1681 | "\n", 1682 | " llm = ChatOpenAI(\n", 1683 | " model=\"gpt-4o-mini\",\n", 1684 | " temperature=0,\n", 1685 | " max_tokens=None,\n", 1686 | " timeout=None,\n", 1687 | " max_retries=2\n", 1688 | " )\n", 1689 | "\n", 1690 | " json_llm = llm.bind(response_format={\"type\": \"json_object\"})\n", 1691 | "\n", 1692 | "\n", 1693 | " #Craft the prompt message\n", 1694 | " messages=[(\"human\",prompt)]\n", 1695 | "\n", 1696 | "\n", 1697 | " # Invoke the LLM\n", 1698 | " ai_msg = json_llm.invoke(messages)\n", 1699 | "\n", 1700 | "\n", 1701 | "\n", 1702 | "\n", 1703 | " # Extract the response in JSON format\n", 1704 | " metadata_response = ai_msg.content\n", 1705 | " try:\n", 1706 | " # Convert the response into a dictionary\n", 1707 | " metadata = eval(metadata_response)\n", 1708 | " except Exception as e:\n", 1709 | " print(f\"Error parsing metadata: {e}\")\n", 1710 | " metadata = {\n", 1711 | " \"player_1\": \"\", \"player_2\": \"\", \"player_3\": \"\", \"player_4\": \"\", \"player_5\": \"\",\n", 1712 | " \"team_1\": \"\", \"team_2\": \"\", \"team_3\": \"\", \"team_4\": \"\", \"team_5\": \"\",\n", 1713 | " \"keyword_1\": \"\", \"keyword_2\": \"\", \"keyword_3\": \"\", \"keyword_4\": \"\", \"keyword_5\": \"\"\n", 1714 | " }\n", 1715 | " return metadata\n", 1716 | "\n", 1717 | "# Step 10: Extract metadata from the query\n", 1718 | "query = \"Virat Kohli records in 2023 Cricket World Cup\"\n", 1719 | "query_metadata = 
extract_fixed_metadata_from_query(query)\n", 1720 | "\n", 1721 | "# Step 11: Define a metadata filter based on the query's extracted metadata\n", 1722 | "def metadata_filter(doc_metadata):\n", 1723 | " query_players = {query_metadata[f\"player_{i}\"] for i in range(1, 6) if query_metadata[f\"player_{i}\"]}\n", 1724 | " query_teams = {query_metadata[f\"team_{i}\"] for i in range(1, 6) if query_metadata[f\"team_{i}\"]}\n", 1725 | " query_keywords = {query_metadata[f\"keyword_{i}\"] for i in range(1, 6) if query_metadata[f\"keyword_{i}\"]}\n", 1726 | " doc_players = {doc_metadata[\"extracted_metadata\"][f\"player_{i}\"] for i in range(1, 6) if doc_metadata[\"extracted_metadata\"][f\"player_{i}\"]}\n", 1727 | " doc_teams = {doc_metadata[\"extracted_metadata\"][f\"team_{i}\"] for i in range(1, 6) if doc_metadata[\"extracted_metadata\"][f\"team_{i}\"]}\n", 1728 | " doc_keywords = {doc_metadata[\"extracted_metadata\"][f\"keyword_{i}\"] for i in range(1, 6) if doc_metadata[\"extracted_metadata\"][f\"keyword_{i}\"]}\n", 1729 | " \n", 1730 | " # Check if there's any overlap between the query metadata and document metadata\n", 1731 | " return bool(query_players & doc_players or query_teams & doc_teams or query_keywords & doc_keywords)\n", 1732 | "\n", 1733 | "# Step 12: Perform a similarity search on the stored chunks with the metadata filter\n", 1734 | "results = vector_store.similarity_search(query=query, k=3, filter=metadata_filter)\n", 1735 | "\n", 1736 | "# Step 13: Display the results with metadata\n", 1737 | "for doc in results:\n", 1738 | " print(f\"Document: {doc.page_content}\")\n", 1739 | " print(f\"Metadata: {doc.metadata}\")\n" 1740 | ] 1741 | }, 1742 | { 1743 | "cell_type": "markdown", 1744 | "metadata": {}, 1745 | "source": [ 1746 | "### 1.2 QUERY OPTIMIZATION\n", 1747 | "\n", 1748 | "The objective of this stage is to optimize the input user query in a manner that makes it better suited for the retrieval tasks" 1749 | ] 1750 | }, 1751 | { 1752 | "cell_type": 
"markdown", 1753 | "metadata": {}, 1754 | "source": [ 1755 | "#### Query Expansion\n", 1756 | "\n", 1757 | "In query expansion, the original user query is enriched with the aim of retrieving more relevant information. This helps in increasing the recall of the system and overcomes the challenge of incomplete or very brief user queries." 1758 | ] 1759 | }, 1760 | { 1761 | "cell_type": "code", 1762 | "execution_count": 28, 1763 | "metadata": {}, 1764 | "outputs": [], 1765 | "source": [ 1766 | "original_query=\"How does climate change affect polar bears?\"\n", 1767 | "num=5" 1768 | ] 1769 | }, 1770 | { 1771 | "cell_type": "code", 1772 | "execution_count": 29, 1773 | "metadata": {}, 1774 | "outputs": [], 1775 | "source": [ 1776 | "response_structure='''\n", 1777 | "{\n", 1778 | " \"queries\": [\n", 1779 | " {\n", 1780 | " \"query\": \"query\",\n", 1781 | " },\n", 1782 | " ...\n", 1783 | "]}\n", 1784 | "'''" 1785 | ] 1786 | }, 1787 | { 1788 | "cell_type": "code", 1789 | "execution_count": 30, 1790 | "metadata": {}, 1791 | "outputs": [ 1792 | { 1793 | "name": "stderr", 1794 | "output_type": "stream", 1795 | "text": [ 1796 | "<>:1: SyntaxWarning: invalid escape sequence '\\S'\n", 1797 | "<>:1: SyntaxWarning: invalid escape sequence '\\S'\n", 1798 | "/var/folders/kz/0m0zvdwn54798h3cfz47bcjw0000gn/T/ipykernel_29711/22333251.py:1: SyntaxWarning: invalid escape sequence '\\S'\n", 1799 | " expansion_prompt=f\"Generate {num} variations of the following query: {original_query}. Respond in JSON format.\\Stick to this Structure :\\n{response_structure}\"\n" 1800 | ] 1801 | } 1802 | ], 1803 | "source": [ 1804 | "expansion_prompt=f\"Generate {num} variations of the following query: {original_query}. 
Respond in JSON format.\\nStick to this Structure :\\n{response_structure}\"" 1805 | ] 1806 | }, 1807 | { 1808 | "cell_type": "code", 1809 | "execution_count": 31, 1810 | "metadata": {}, 1811 | "outputs": [], 1812 | "source": [ 1813 | "step_back_expansion_prompt = f\"Given the query: '{original_query}', generate a more abstract, higher-level conceptual query.\"" 1814 | ] 1815 | }, 1816 | { 1817 | "cell_type": "code", 1818 | "execution_count": 32, 1819 | "metadata": {}, 1820 | "outputs": [], 1821 | "source": [ 1822 | "sub_query_expansion_prompt=f\"Break down the following query into {num} sub-queries targeting different aspects of the query: '{original_query}'. Respond in JSON format.\"\n" 1823 | ] 1824 | }, 1825 | { 1826 | "cell_type": "code", 1827 | "execution_count": 33, 1828 | "metadata": {}, 1829 | "outputs": [], 1830 | "source": [ 1831 | "# Importing the OpenAI library\n", 1832 | "from openai import OpenAI\n", 1833 | "\n", 1834 | "# Instantiate the OpenAI client\n", 1835 | "client = OpenAI()\n", 1836 | "\n", 1837 | "# Make the API call passing the augmented prompt to the LLM\n", 1838 | "response = client.chat.completions.create(\n", 1839 | "    model=\"gpt-4o-mini\",\n", 1840 | "    messages=\t[\n", 1841 | "        {\"role\": \"user\", \"content\": expansion_prompt}\n", 1842 | "    \t\t],\n", 1843 | "    response_format={ \"type\": \"json_object\" }\n", 1844 | ")\n", 1845 | "\n", 1846 | "# Extract the answer from the response object\n", 1847 | "answer=response.choices[0].message.content" 1848 | ] 1849 | }, 1850 | { 1851 | "cell_type": "code", 1852 | "execution_count": 34, 1853 | "metadata": {}, 1854 | "outputs": [ 1855 | { 1856 | "name": "stdout", 1857 | "output_type": "stream", 1858 | "text": [ 1859 | "{\n", 1860 | "  \"queries\": [\n", 1861 | "    {\n", 1862 | "      \"query\": \"What impact does climate change have on polar bear populations?\"\n", 1863 | "    },\n", 1864 | "    {\n", 1865 | "      \"query\": \"In what ways does climate change influence the habitat of polar bears?\"\n", 1866 | "
},\n", 1867 | " {\n", 1868 | " \"query\": \"How are polar bears being affected by global warming?\"\n", 1869 | " },\n", 1870 | " {\n", 1871 | " \"query\": \"What are the consequences of climate change for polar bear survival?\"\n", 1872 | " },\n", 1873 | " {\n", 1874 | " \"query\": \"How is the behavior of polar bears changing due to climate change?\"\n", 1875 | " }\n", 1876 | " ]\n", 1877 | "}\n" 1878 | ] 1879 | } 1880 | ], 1881 | "source": [ 1882 | "print(answer)" 1883 | ] 1884 | }, 1885 | { 1886 | "cell_type": "code", 1887 | "execution_count": 35, 1888 | "metadata": {}, 1889 | "outputs": [], 1890 | "source": [ 1891 | "\n", 1892 | "\n", 1893 | "# Make the API call passing the augmented prompt to the LLM\n", 1894 | "response = client.chat.completions.create(\n", 1895 | " model=\"gpt-4o-mini\",\n", 1896 | " messages=\t[\n", 1897 | " {\"role\": \"user\", \"content\": step_back_expansion_prompt}\n", 1898 | " ]\n", 1899 | ")\n", 1900 | "\n", 1901 | "# Extract the answer from the response object\n", 1902 | "answer=response.choices[0].message.content" 1903 | ] 1904 | }, 1905 | { 1906 | "cell_type": "code", 1907 | "execution_count": 36, 1908 | "metadata": {}, 1909 | "outputs": [ 1910 | { 1911 | "name": "stdout", 1912 | "output_type": "stream", 1913 | "text": [ 1914 | "\"What are the broader ecological and environmental impacts of climate change on specialized species in arctic ecosystems?\"\n" 1915 | ] 1916 | } 1917 | ], 1918 | "source": [ 1919 | "print(answer)" 1920 | ] 1921 | }, 1922 | { 1923 | "cell_type": "code", 1924 | "execution_count": 37, 1925 | "metadata": {}, 1926 | "outputs": [], 1927 | "source": [ 1928 | "\n", 1929 | "# Make the API call passing the augmented prompt to the LLM\n", 1930 | "response = client.chat.completions.create(\n", 1931 | " model=\"gpt-4o-mini\",\n", 1932 | " messages=\t[\n", 1933 | " {\"role\": \"user\", \"content\": sub_query_expansion_prompt}\n", 1934 | " ],\n", 1935 | " response_format={ \"type\": \"json_object\" }\n", 1936 | ")\n", 
1937 | "\n", 1938 | "# Extract the answer from the response object\n", 1939 | "answer=response.choices[0].message.content" 1940 | ] 1941 | }, 1942 | { 1943 | "cell_type": "code", 1944 | "execution_count": 38, 1945 | "metadata": {}, 1946 | "outputs": [ 1947 | { 1948 | "name": "stdout", 1949 | "output_type": "stream", 1950 | "text": [ 1951 | "{\n", 1952 | " \"sub_queries\": [\n", 1953 | " {\n", 1954 | " \"id\": 1,\n", 1955 | " \"focus\": \"Impact on Habitat\",\n", 1956 | " \"query\": \"What changes in habitat occur for polar bears due to climate change?\"\n", 1957 | " },\n", 1958 | " {\n", 1959 | " \"id\": 2,\n", 1960 | " \"focus\": \"Food Availability\",\n", 1961 | " \"query\": \"How does climate change affect the availability of food sources for polar bears?\"\n", 1962 | " },\n", 1963 | " {\n", 1964 | " \"id\": 3,\n", 1965 | " \"focus\": \"Reproductive Health\",\n", 1966 | " \"query\": \"What are the effects of climate change on the reproductive health and population dynamics of polar bears?\"\n", 1967 | " },\n", 1968 | " {\n", 1969 | " \"id\": 4,\n", 1970 | " \"focus\": \"Behavioral Changes\",\n", 1971 | " \"query\": \"How does climate change influence the behavior and migration patterns of polar bears?\"\n", 1972 | " },\n", 1973 | " {\n", 1974 | " \"id\": 5,\n", 1975 | " \"focus\": \"Long-term Survival\",\n", 1976 | " \"query\": \"What are the long-term implications of climate change on the survival of polar bear populations?\"\n", 1977 | " }\n", 1978 | " ]\n", 1979 | "}\n" 1980 | ] 1981 | } 1982 | ], 1983 | "source": [ 1984 | "print(answer)" 1985 | ] 1986 | }, 1987 | { 1988 | "cell_type": "markdown", 1989 | "metadata": {}, 1990 | "source": [ 1991 | "#### Query Transformation\n", 1992 | "\n", 1993 | "Compared to query expansion, in query transformation, instead of the original user query retrieval happens on a transformed query which is more suitable for the retriever" 1994 | ] 1995 | }, 1996 | { 1997 | "cell_type": "code", 1998 | "execution_count": 39, 1999 | 
"metadata": {}, 2000 | "outputs": [], 2001 | "source": [ 2002 | "original_query=\"How does climate change affect polar bears?\"" 2003 | ] 2004 | }, 2005 | { 2006 | "cell_type": "code", 2007 | "execution_count": 40, 2008 | "metadata": {}, 2009 | "outputs": [], 2010 | "source": [ 2011 | "system_prompt=\"You are an expert in climate change and arctic life.\"\n", 2012 | "hyde_prompt=f\"Generate an answer to the question: {original_query}\"" 2013 | ] 2014 | }, 2015 | { 2016 | "cell_type": "code", 2017 | "execution_count": 41, 2018 | "metadata": {}, 2019 | "outputs": [], 2020 | "source": [ 2021 | "\n", 2022 | "# Make the API call passing the augmented prompt to the LLM\n", 2023 | "response = client.chat.completions.create(\n", 2024 | " model=\"gpt-4o-mini\",\n", 2025 | " messages=\t[\n", 2026 | " {\"role\": \"system\", \"content\": system_prompt},\n", 2027 | " {\"role\": \"user\", \"content\": hyde_prompt}\n", 2028 | " ]\n", 2029 | ")\n", 2030 | "\n", 2031 | "# Extract the answer from the response object\n", 2032 | "answer=response.choices[0].message.content" 2033 | ] 2034 | }, 2035 | { 2036 | "cell_type": "code", 2037 | "execution_count": 42, 2038 | "metadata": {}, 2039 | "outputs": [ 2040 | { 2041 | "name": "stdout", 2042 | "output_type": "stream", 2043 | "text": [ 2044 | "Climate change significantly affects polar bears primarily through the loss of sea ice, which is crucial for their survival. Here are some key ways in which climate change impacts polar bears:\n", 2045 | "\n", 2046 | "1. **Loss of Habitat**: Polar bears depend on sea ice as a platform for hunting seals, their primary food source. As global temperatures rise, sea ice is melting at an alarming rate during the summer months and forming later in the fall. This reduced availability of ice means bears have to swim longer distances to find food.\n", 2047 | "\n", 2048 | "2. **Decreased Prey Availability**: As ice melts, the seals that polar bears hunt also face challenges. 
With the loss of stable ice platforms, seal populations may decrease, ultimately leading to food scarcity for polar bears. This reduced access to prey impacts their health, reproduction, and survival rates.\n", 2049 | "\n", 2050 | "3. **Increased Energy Expenditure**: With the melting ice, polar bears may have to travel further in search of food. This increased energy expenditure can lead to fatigue and decreased body condition, especially for mothers who are nursing cubs. Hungry bears are less likely to successfully raise their young.\n", 2051 | "\n", 2052 | "4. **Climate-Induced Stressors**: The stress of adapting to rapidly changing environments can lead to behavioral changes in polar bears. Increased competition for dwindling resources can also lead to aggression among bears, particularly in areas where human activities are encroaching on their habitat.\n", 2053 | "\n", 2054 | "5. **Reproductive Challenges**: The combined effects of reduced food availability and habitat loss affect polar bear reproduction. Poor body condition can result in lower birth rates and higher cub mortality, threatening the long-term viability of polar bear populations.\n", 2055 | "\n", 2056 | "6. **Increased Human-Bear Conflicts**: As bears are forced to venture onto land more frequently due to diminishing sea ice, encounters with human populations can increase. This may lead to more conflicts, potential dangers for both bears and humans, and could affect the conservation strategies in these areas.\n", 2057 | "\n", 2058 | "In summary, climate change poses a significant threat to polar bears by disrupting their habitat, food sources, and overall survival. 
Protecting their environment and mitigating climate change impacts are crucial for the future of these iconic Arctic mammals.\n" 2059 | ] 2060 | } 2061 | ], 2062 | "source": [ 2063 | "print(answer)" 2064 | ] 2065 | }, 2066 | { 2067 | "cell_type": "code", 2068 | "execution_count": 44, 2069 | "metadata": {}, 2070 | "outputs": [ 2071 | { 2072 | "name": "stdout", 2073 | "output_type": "stream", 2074 | "text": [ 2075 | "The embedding dimension is: 1536\n" 2076 | ] 2077 | } 2078 | ], 2079 | "source": [ 2080 | "# Initialize the OpenAIEmbeddings model\n", 2081 | "embeddings = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n", 2082 | "\n", 2083 | "# Create embedding for the hypothetical answer\n", 2084 | "hyde_embedding = embeddings.embed_query(answer)\n", 2085 | "\n", 2086 | "# Check and print the dimension of the embedding\n", 2087 | "embedding_dimension = len(hyde_embedding)\n", 2088 | "print(f\"The embedding dimension is: {embedding_dimension}\")\n" 2089 | ] 2090 | }, 2091 | { 2092 | "cell_type": "markdown", 2093 | "metadata": {}, 2094 | "source": [ 2095 | "## 2. Retrieval Strategies" 2096 | ] 2097 | }, 2098 | { 2099 | "cell_type": "markdown", 2100 | "metadata": {}, 2101 | "source": [ 2102 | "Interventions in the pre-retrieval stage can bring significant improvements in the performance of the RAG system if the query and the knowledge base becomes well aligned with the retrieval algorithm. " 2103 | ] 2104 | }, 2105 | { 2106 | "cell_type": "markdown", 2107 | "metadata": {}, 2108 | "source": [ 2109 | "#### Hybrid Retrieval" 2110 | ] 2111 | }, 2112 | { 2113 | "cell_type": "code", 2114 | "execution_count": 46, 2115 | "metadata": {}, 2116 | "outputs": [ 2117 | { 2118 | "name": "stderr", 2119 | "output_type": "stream", 2120 | "text": [ 2121 | "Fetching pages: 100%|##########| 1/1 [00:00<00:00, 3.25it/s]\n" 2122 | ] 2123 | }, 2124 | { 2125 | "name": "stdout", 2126 | "output_type": "stream", 2127 | "text": [ 2128 | "Retrieval Type: dense\n", 2129 | "Result: 46. 
**^** \"It's official! India set up 2023 World Cup semi-final against New Zealand in 2019 rematch; Pakistan knocked out\". _Hindustan Times_. 11 November 2023. Archived from the original on 14 November 2023. Retrieved 12 November 2023.\n", 2130 | " 47. **^** \"2023 World Cup Cricket Batting Records & Stats runs\". _ESPNcricinfo_. Archived from the original on 18 October 2023. Retrieved 19 October 2023.\n", 2131 | "\n", 2132 | "Retrieval Type: dense\n", 2133 | "Result: 48. **^** \"2023 World Cup Cricket bowling Records & Stats wickets\". _ESPNcricinfo_. Archived from the original on 9 October 2023. Retrieved 10 October 2023.\n", 2134 | " 49. **^** \"India star named Player of the Tournament at ICC Men's Cricket World Cup\". _Cricket World Cup_. Archived from the original on 19 November 2023. Retrieved 19 November 2023.\n", 2135 | "\n", 2136 | "Retrieval Type: dense\n", 2137 | "Result: Main article: 2023 Cricket World Cup final\n", 2138 | "\n", 2139 | "19 November 2023 \n", 2140 | "14:00 (D/N) \n", 2141 | "Scorecard \n", 2142 | "--- \n", 2143 | "**India ** \n", 2144 | "240 (50 overs) | **v** | **Australia** \n", 2145 | "241/4 (43 overs) \n", 2146 | "---|---|--- \n", 2147 | "| | \n", 2148 | "**Australia won by 6 wickets** \n", 2149 | "Narendra Modi Stadium, Ahmedabad \n", 2150 | "--- \n", 2151 | " \n", 2152 | "## Statistics\n", 2153 | "\n", 2154 | "Main article: 2023 Cricket World Cup statistics\n", 2155 | "\n", 2156 | "### Most runs\n", 2157 | "\n", 2158 | "Retrieval Type: dense\n", 2159 | "Result: 2023 ICC Men's Cricket World Cup \n", 2160 | "--- \n", 2161 | "Dates| 5 October – 19 November 2023 \n", 2162 | "Administrator(s)| International Cricket Council \n", 2163 | "Cricket format| One Day International (ODI) \n", 2164 | "Tournament format(s)| Round-robin and knockout \n", 2165 | "Host(s)| India \n", 2166 | "Champions| Australia (6th title) \n", 2167 | "Runners-up| India \n", 2168 | "Participants| 10 \n", 2169 | "Matches| 48 \n", 2170 | "Attendance| 1,250,307 
(26,048 per match) \n", 2171 | "Player of the series| Virat Kohli \n", 2172 | "Most runs| Virat Kohli (765) \n", 2173 | "Most wickets| Mohammed Shami (24)\n", 2174 | "\n", 2175 | "Retrieval Type: dense\n", 2176 | "Result: 53. ^ _**a**_ _**b**_ Pennington, Adrian. \"Behind the ICC Men's Cricket World Cup 2023 innovations with vertical video feed plus ball and player tracking\". _SVG Europe_. Retrieved 15 February 2024.\n", 2177 | " 54. ^ _**a**_ _**b**_ \"Disney sets India cricket viewership record for TV, streaming during world cup\". _Reuters_. 23 November 2023. Retrieved 14 February 2024.\n", 2178 | "\n", 2179 | "Retrieval Type: sparse\n", 2180 | "Result: Virat Kohli was named the player of the tournament and also scored the most\n", 2181 | "runs, while Mohammed Shami was the leading wicket-taker. A total of 1,250,307\n", 2182 | "spectators attended the matches, the highest number in any Cricket World Cup\n", 2183 | "to date.[1] The tournament final set viewership records in India, drawing 518\n", 2184 | "million viewers, with a peak of 57 million streaming viewers.\n", 2185 | "\n", 2186 | "## Background\n", 2187 | "\n", 2188 | "### Host selection\n", 2189 | "\n", 2190 | "Retrieval Type: sparse\n", 2191 | "Result: Participants| 10 \n", 2192 | "Matches| 48 \n", 2193 | "Attendance| 1,250,307 (26,048 per match) \n", 2194 | "Player of the series| Virat Kohli \n", 2195 | "Most runs| Virat Kohli (765) \n", 2196 | "Most wickets| Mohammed Shami (24) \n", 2197 | "Official website| cricketworldcup.com \n", 2198 | "<- 2019 _2027_ -> \n", 2199 | "**Part of a series on the** \n", 2200 | "--- \n", 2201 | "2023 Cricket World Cup / \n", 2202 | "2025 ICC Champions Trophy \n", 2203 | "CWC: Category • Commons \n", 2204 | "CT: Category • Commons \n", 2205 | "2023 Cricket World Cup \n", 2206 | "Background\n", 2207 | "\n", 2208 | "Retrieval Type: sparse\n", 2209 | "Result: 2023 ICC Men's Cricket World Cup \n", 2210 | "--- \n", 2211 | "Dates| 5 October – 19 November 2023 \n", 2212 | 
"Administrator(s)| International Cricket Council \n", 2213 | "Cricket format| One Day International (ODI) \n", 2214 | "Tournament format(s)| Round-robin and knockout \n", 2215 | "Host(s)| India \n", 2216 | "Champions| Australia (6th title) \n", 2217 | "Runners-up| India \n", 2218 | "Participants| 10 \n", 2219 | "Matches| 48 \n", 2220 | "Attendance| 1,250,307 (26,048 per match) \n", 2221 | "Player of the series| Virat Kohli \n", 2222 | "Most runs| Virat Kohli (765) \n", 2223 | "Most wickets| Mohammed Shami (24)\n", 2224 | "\n", 2225 | "Retrieval Type: sparse\n", 2226 | "Result: Main article: 2023 Cricket World Cup statistics\n", 2227 | "\n", 2228 | "### Most runs\n", 2229 | "\n", 2230 | "Runs | Player | Team \n", 2231 | "---|---|--- \n", 2232 | "765 | Virat Kohli | India \n", 2233 | "597 | Rohit Sharma | India \n", 2234 | "594 | Quinton de Kock | South Africa \n", 2235 | "578 | Rachin Ravindra | New Zealand \n", 2236 | "552 | Daryl Mitchell | New Zealand \n", 2237 | " \n", 2238 | " * Source: ESPNcricinfo[47]\n", 2239 | "\n", 2240 | "### Most wickets\n", 2241 | "\n" 2242 | ] 2243 | }, 2244 | { 2245 | "name": "stderr", 2246 | "output_type": "stream", 2247 | "text": [ 2248 | "/var/folders/kz/0m0zvdwn54798h3cfz47bcjw0000gn/T/ipykernel_29711/394055649.py:58: LangChainDeprecationWarning: The method `BaseRetriever.get_relevant_documents` was deprecated in langchain-core 0.1.46 and will be removed in 1.0. 
Use :meth:`~invoke` instead.\n", 2249 | " sparse_results = bm25_retriever.get_relevant_documents(query)\n" 2250 | ] 2251 | } 2252 | ], 2253 | "source": [ 2254 | "import faiss\n", 2255 | "from langchain_community.vectorstores import FAISS\n", 2256 | "from langchain_community.docstore.in_memory import InMemoryDocstore\n", 2257 | "from langchain_openai import OpenAIEmbeddings\n", 2258 | "from langchain_core.documents import Document\n", 2259 | "from langchain_community.document_loaders import AsyncHtmlLoader\n", 2260 | "from langchain_community.document_transformers import Html2TextTransformer\n", 2261 | "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", 2262 | "from langchain_community.retrievers import BM25Retriever\n", 2263 | "\n", 2264 | "# Step 1: Load data from a URL (Wikipedia page)\n", 2265 | "url = \"https://en.wikipedia.org/wiki/2023_Cricket_World_Cup\"\n", 2266 | "loader = AsyncHtmlLoader(url)\n", 2267 | "data = loader.load()\n", 2268 | "\n", 2269 | "# Step 2: Transform the HTML content to plain text\n", 2270 | "html2text = Html2TextTransformer()\n", 2271 | "data_transformed = html2text.transform_documents(data)\n", 2272 | "\n", 2273 | "# Step 3: Split the text into smaller chunks using RecursiveCharacterTextSplitter\n", 2274 | "text_splitter = RecursiveCharacterTextSplitter(\n", 2275 | " chunk_size=500, # Number of characters in each chunk\n", 2276 | " chunk_overlap=200 # Number of overlapping characters between chunks\n", 2277 | ")\n", 2278 | "chunks = text_splitter.split_text(data_transformed[0].page_content)\n", 2279 | "\n", 2280 | "# Step 4: Dense Retrieval (FAISS + OpenAI Embeddings)\n", 2281 | "\n", 2282 | "# Initialize OpenAI Embeddings model for dense retrieval\n", 2283 | "embedding_model = OpenAIEmbeddings(model=\"text-embedding-ada-002\")\n", 2284 | "\n", 2285 | "# Initialize FAISS index for dense retrieval\n", 2286 | "embedding_dim = len(embedding_model.embed_query(\"hello world\"))\n", 2287 | "index = 
faiss.IndexFlatL2(embedding_dim)\n", 2288 | "\n", 2289 | "# Create an in-memory document store to support adding documents\n", 2290 | "docstore = InMemoryDocstore()\n", 2291 | "\n", 2292 | "# Initialize FAISS vector store\n", 2293 | "vector_store = FAISS(embedding_function=embedding_model, index=index, docstore=docstore, index_to_docstore_id={})\n", 2294 | "\n", 2295 | "# Add chunks to FAISS vector store\n", 2296 | "documents = [Document(page_content=chunk) for chunk in chunks]\n", 2297 | "vector_store.add_documents(documents)\n", 2298 | "\n", 2299 | "# Step 5: Sparse Retrieval (BM25 using LangChain's BM25Retriever)\n", 2300 | "\n", 2301 | "# Initialize BM25Retriever\n", 2302 | "bm25_retriever = BM25Retriever.from_documents(documents)\n", 2303 | "\n", 2304 | "# Step 6: Hybrid Retrieval Strategy\n", 2305 | "\n", 2306 | "def hybrid_search(query, k=5):\n", 2307 | " # Step 6.1: Perform dense retrieval using FAISS\n", 2308 | " dense_results = vector_store.similarity_search(query=query, k=k)\n", 2309 | " \n", 2310 | " # Step 6.2: Perform sparse retrieval using BM25Retriever\n", 2311 | " sparse_results = bm25_retriever.get_relevant_documents(query)\n", 2312 | " \n", 2313 | " # Limit sparse results to top-k\n", 2314 | " sparse_results = sparse_results[:k]\n", 2315 | " \n", 2316 | " # Step 6.3: Combine dense and sparse results\n", 2317 | " combined_results = []\n", 2318 | " for dense_doc in dense_results:\n", 2319 | " combined_results.append((\"dense\", dense_doc.page_content))\n", 2320 | "\n", 2321 | " for sparse_doc in sparse_results:\n", 2322 | " combined_results.append((\"sparse\", sparse_doc.page_content))\n", 2323 | "\n", 2324 | " # Optionally, re-rank or further process combined results\n", 2325 | " return combined_results\n", 2326 | "\n", 2327 | "# Step 7: Perform a hybrid search\n", 2328 | "query = \"Virat Kohli records in 2023 Cricket World Cup\"\n", 2329 | "hybrid_results = hybrid_search(query)\n", 2330 | "\n", 2331 | "# Step 8: Display the results\n", 2332 | 
"for retrieval_type, result in hybrid_results:\n", 2333 | " print(f\"Retrieval Type: {retrieval_type}\")\n", 2334 | " print(f\"Result: {result}\\n\")\n" 2335 | ] 2336 | }, 2337 | { 2338 | "cell_type": "markdown", 2339 | "metadata": {}, 2340 | "source": [ 2341 | "## 3. Post Retrieval Stage" 2342 | ] 2343 | }, 2344 | { 2345 | "cell_type": "markdown", 2346 | "metadata": {}, 2347 | "source": [ 2348 | "At the post-retrieval stage the approaches of reranking and compression help in providing better context to the LLM for generation." 2349 | ] 2350 | }, 2351 | { 2352 | "cell_type": "markdown", 2353 | "metadata": {}, 2354 | "source": [ 2355 | "#### Compression" 2356 | ] 2357 | }, 2358 | { 2359 | "cell_type": "markdown", 2360 | "metadata": {}, 2361 | "source": [ 2362 | "In prompt compression, language models are used to detect and remove unimportant and irrelevant tokens" 2363 | ] 2364 | }, 2365 | { 2366 | "cell_type": "code", 2367 | "execution_count": 47, 2368 | "metadata": {}, 2369 | "outputs": [], 2370 | "source": [ 2371 | "document_to_compress=retrieved_docs[0].page_content" 2372 | ] 2373 | }, 2374 | { 2375 | "cell_type": "code", 2376 | "execution_count": 48, 2377 | "metadata": {}, 2378 | "outputs": [], 2379 | "source": [ 2380 | "compress_prompt = f\"Compress the following document into very short sentences, retaining only the extremely essential information:\\n\\n{document_to_compress}\"" 2381 | ] 2382 | }, 2383 | { 2384 | "cell_type": "code", 2385 | "execution_count": 49, 2386 | "metadata": {}, 2387 | "outputs": [], 2388 | "source": [ 2389 | "# Make the API call passing the augmented prompt to the LLM\n", 2390 | "response = client.chat.completions.create(\n", 2391 | " model=\"gpt-4o-mini\",\n", 2392 | " messages=\t[\n", 2393 | " {\"role\": \"user\", \"content\": compress_prompt}\n", 2394 | " ]\n", 2395 | ")\n", 2396 | "\n", 2397 | "# Extract the answer from the response object\n", 2398 | "answer=response.choices[0].message.content" 2399 | ] 2400 | }, 2401 | { 2402 | 
"cell_type": "code", 2403 | "execution_count": 50, 2404 | "metadata": {}, 2405 | "outputs": [ 2406 | { 2407 | "name": "stdout", 2408 | "output_type": "stream", 2409 | "text": [ 2410 | "The 2023 ICC Men's Cricket World Cup took place in India from October 5 to\n", 2411 | "November 19, 2023. It was the 13th ODI tournament with ten teams. Australia won,\n", 2412 | "defeating India in the final. The match was held at the Narendra Modi Stadium in\n", 2413 | "Ahmedabad. Attendance exceeded 1.25 million, and the final had 518 million\n", 2414 | "viewers. Virat Kohli was Player of the Series with 765 runs. Mohammed Shami took\n", 2415 | "the most wickets, with 24. The event was postponed from early 2023 due to\n", 2416 | "COVID-19. New penalties for slow over-rates were introduced. The format included\n", 2417 | "a round-robin stage and knockout rounds. Final Match: - Date: 19 November 2023\n", 2418 | "- India: 240 (50 overs) - Australia: 241/4 (43 overs) - Australia won by 6\n", 2419 | "wickets. Top Performers: - Most Runs: Virat Kohli (765), Rohit Sharma (597),\n", 2420 | "Quinton de Kock (594). - Most Wickets: Mohammed Shami (24), Adam Zampa (23),\n", 2421 | "Dilshan Madushanka (21).\n" 2422 | ] 2423 | } 2424 | ], 2425 | "source": [ 2426 | "print(textwrap.fill(answer, width=80))" 2427 | ] 2428 | }, 2429 | { 2430 | "cell_type": "code", 2431 | "execution_count": 51, 2432 | "metadata": {}, 2433 | "outputs": [ 2434 | { 2435 | "name": "stdout", 2436 | "output_type": "stream", 2437 | "text": [ 2438 | "The 2023 ICC Men's Cricket World Cup, held in India from October 5 to November\n", 2439 | "19, 2023, marked the 13th edition of this prestigious One Day International\n", 2440 | "(ODI) tournament, featuring ten national teams. Australia emerged as champions,\n", 2441 | "claiming their sixth title by defeating India in the final at the Narendra Modi\n", 2442 | "Stadium in Ahmedabad. 
The tournament attracted a record attendance of over 1.25\n", 2443 | "million spectators and set viewership records in India, with 518 million viewers\n", 2444 | "for the final. Virat Kohli was named Player of the Series, scoring the most runs\n", 2445 | "(765), while Mohammed Shami led in wickets taken (24). The event was initially\n", 2446 | "scheduled for early 2023 but was postponed due to the COVID-19 pandemic, and it\n", 2447 | "introduced new penalties for slow over-rates. The tournament format included a\n", 2448 | "round-robin group stage followed by knockout rounds, culminating in a highly\n", 2449 | "anticipated final. Main article: 2023 Cricket World Cup final 19 November 2023\n", 2450 | "14:00 (D/N) Scorecard --- **India ** 240 (50 overs) | **v** |\n", 2451 | "**Australia** 241/4 (43 overs) ---|---|--- | | **Australia won by 6\n", 2452 | "wickets** Narendra Modi Stadium, Ahmedabad --- ## Statistics Main\n", 2453 | "article: 2023 Cricket World Cup statistics ### Most runs Runs | Player |\n", 2454 | "Team ---|---|--- 765 | Virat Kohli | India 597 | Rohit Sharma | India\n", 2455 | "594 | Quinton de Kock | South Africa 578 | Rachin Ravindra | New Zealand\n", 2456 | "552 | Daryl Mitchell | New Zealand * Source: ESPNcricinfo[47] ### Most\n", 2457 | "wickets Wickets | Player | Team ---|---|--- 24 | Mohammed Shami |\n", 2458 | "India 23 | Adam Zampa | Australia 21 | Dilshan Madushanka | Sri Lanka\n", 2459 | "20 | Jasprit Bumrah | India 20 | Gerald Coetzee | South Africa *\n", 2460 | "Source: ESPNcricinfo[48] ### Team of the tournament\n" 2461 | ] 2462 | } 2463 | ], 2464 | "source": [ 2465 | "print(textwrap.fill(document_to_compress, width=80))" 2466 | ] 2467 | }, 2468 | { 2469 | "cell_type": "markdown", 2470 | "metadata": {}, 2471 | "source": [ 2472 | "---" 2473 | ] 2474 | }, 2475 | { 2476 | "cell_type": "markdown", 2477 | "metadata": {}, 2478 | "source": [ 2479 | " \n", 2480 | "\n", 2481 | "Hi! I'm Abhinav! 
I am an entrepreneur and Vice President of Artificial Intelligence at Yarnit. I have spent over 15 years consulting and leadership roles in data science, machine learning and AI. My current focus is in the applied Generative AI domain focussing on solving enterprise needs through contextual intelligence. I'm passionate about AI advancements constantly exploring emerging technologies to push the boundaries and create positive impacts in the world. Let’s build the future, together!\n", 2482 | "\n", 2483 | "[If you haven't already, please subscribe to the MEAP of A Simple Guide to Retrieval Augmented Generation here](https://mng.bz/8wdg)\n", 2484 | "\n", 2485 | "\n", 2486 | " \"New\n", 2487 | "\n", 2488 | "\n", 2489 | "#### If you'd like to chat, I'd be very happy to connect\n", 2490 | "\n", 2491 | "[![GitHub followers](https://img.shields.io/badge/Github-000000?style=for-the-badge&logo=github&logoColor=black&color=orange)](https://github.com/abhinav-kimothi)\n", 2492 | "[![LinkedIn](https://img.shields.io/badge/LinkedIn-000000?style=for-the-badge&logo=linkedin&logoColor=orange&color=black)](https://www.linkedin.com/comm/mynetwork/discovery-see-all?usecase=PEOPLE_FOLLOWS&followMember=abhinav-kimothi)\n", 2493 | "[![Medium](https://img.shields.io/badge/Medium-000000?style=for-the-badge&logo=medium&logoColor=black&color=orange)](https://medium.com/@abhinavkimothi)\n", 2494 | "[![Insta](https://img.shields.io/badge/Instagram-000000?style=for-the-badge&logo=instagram&logoColor=orange&color=black)](https://www.instagram.com/akaiworks/)\n", 2495 | "[![Mail](https://img.shields.io/badge/email-000000?style=for-the-badge&logo=gmail&logoColor=black&color=orange)](mailto:abhinav.kimothi.ds@gmail.com)\n", 2496 | "[![X](https://img.shields.io/badge/Follow-000000?style=for-the-badge&logo=X&logoColor=orange&color=black)](https://twitter.com/abhinav_kimothi)\n", 2497 | 
"[![Linktree](https://img.shields.io/badge/Linktree-000000?style=for-the-badge&logo=linktree&logoColor=black&color=orange)](https://linktr.ee/abhinavkimothi)\n", 2498 | "[![Gumroad](https://img.shields.io/badge/Gumroad-000000?style=for-the-badge&logo=gumroad&logoColor=orange&color=black)](https://abhinavkimothi.gumroad.com/)\n", 2499 | "\n", 2500 | "---" 2501 | ] 2502 | }, 2503 | { 2504 | "cell_type": "markdown", 2505 | "metadata": {}, 2506 | "source": [] 2507 | } 2508 | ], 2509 | "metadata": { 2510 | "kernelspec": { 2511 | "display_name": ".ch6", 2512 | "language": "python", 2513 | "name": "python3" 2514 | }, 2515 | "language_info": { 2516 | "codemirror_mode": { 2517 | "name": "ipython", 2518 | "version": 3 2519 | }, 2520 | "file_extension": ".py", 2521 | "mimetype": "text/x-python", 2522 | "name": "python", 2523 | "nbconvert_exporter": "python", 2524 | "pygments_lexer": "ipython3", 2525 | "version": "3.13.2" 2526 | } 2527 | }, 2528 | "nbformat": 4, 2529 | "nbformat_minor": 2 2530 | } 2531 | -------------------------------------------------------------------------------- /Chapters/Readme.md: -------------------------------------------------------------------------------- 1 | # A Simple Introduction to RAG (Code Snippets) 2 | 3 | 4 | New MEAP 5 | 6 | 7 | 8 | - Chapter 3 - Indexing Pipeline : Creating a knowledge base for RAG based applications [First draft Released](./Chapter-03/indexing_pipeline.ipynb) 9 | 10 | - Chapter 4 - Generation Pipeline: Real time interaction for contextual responses [First draft Released](./Chapter-04/generation_pipeline.ipynb) 11 | 12 | - Chapter 5 - RAG Evaluation : Checking accuracy, relevance and faithfulness [First draft Released](./Chapter-05/rag_evaluations.ipynb) 13 | 14 | - Chapter 6 - Progression of RAG systems : Naive to Advanced to Modular [First draft Released](./Chapter-06/advanced_rag.ipynb) 15 | 16 | --------------------------------------------------------------------------------
/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Abhinav Kimothi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A Simple Guide to Retrieval Augmented Generation 2 | This repository is the source code for examples and illustrations discussed in the book - [A Simple Guide to Retrieval Augmented Generation](https://mng.bz/8wdg) published by [Manning Publications](https://www.manning.com/?utm_source=kimothi&utm_medium=affiliate&utm_campaign=affiliate&a_aid=kimothi) 3 | 4 | 5 | New MEAP 6 | 7 | 8 | Retrieval Augmented Generation, or RAG, stands as a pivotal technique shaping the landscape of the applied generative AI. 
A novel concept introduced by Lewis et al. in their seminal paper Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks (https://arxiv.org/abs/2005.11401), RAG has swiftly emerged as a cornerstone, enhancing reliability and trustworthiness in the outputs from Large Language Models (LLMs) 9 | 10 | About the book 11 | --- 12 | This book is a foundational guide designed particularly for beginners looking for an easy, yet comprehensive introduction to Retrieval Augmented Generation. This book does not go deep into the technical nitty-gritties of RAG; rather, it provides an overview. Data Scientists, Data Engineers, ML Engineers, Software Developers, Technology Leaders, Students and Academicians interested in generative AI powered application development will find this book valuable. Upon completing this book, you can expect to: 13 | 14 | - Develop a solid understanding of RAG fundamentals, the components of a RAG enabled system and its practical applications. 15 | 16 | - Know what a non-parametric knowledge base for RAG means and how it is created. 17 | 18 | - Gain knowledge about developing a RAG enabled system with details about the indexing pipeline and the generation pipeline.
19 | - Gain deep insights into the evaluation of RAG enabled systems and modularised evaluation strategies 20 | 21 | - Familiarize yourself with advanced RAG strategies and the evolving landscape 22 | 23 | - Acquire knowledge of available tools, technologies and frameworks for building and deploying production grade RAG systems 24 | 25 | - Get an understanding of the current limitations of RAG and an exposure to popular emerging techniques for further exploration 26 | 27 | __Note: This book is still in development and is scheduled to be completed in the next few months__ 28 | 29 | Link to the [official source code repository](https://github.com/abhinav-kimothi/A-Simple-Introduction-to-RAG) 30 | 31 | Link to [join the MEAP at manning.com](https://mng.bz/8wdg) 32 | 33 | To download a copy of this repository, click on the [Download ZIP](https://github.com/abhinav-kimothi/A-Simple-Introduction-to-RAG/archive/refs/heads/main.zip) button or execute the following command in your terminal: 34 | 35 | ``` 36 | git clone https://github.com/abhinav-kimothi/A-Simple-Introduction-to-RAG.git 37 | ``` 38 | 39 | 40 | Table of Contents 41 | --- 42 | 43 | The first three chapters of the book have been released as a part of the Manning Early Access Program. 
You can [join the MEAP here](https://mng.bz/8wdg) 44 | 45 | - Chapter 1 - Large Language Models and the Need for Retrieval Augmented Generation [First draft Released] 46 | 47 | - Chapter 2 - RAG-enabled systems and their design [First draft Released] 48 | 49 | - Chapter 3 - Indexing Pipeline : Creating a knowledge base for RAG based applications [First draft Released] [Notebook](./Chapters/Chapter-03/indexing_pipeline.ipynb) 50 | 51 | - Chapter 4 - Generation Pipeline: Real time interaction for contextual responses [Notebook](./Chapters/Chapter-04/generation_pipeline.ipynb) 52 | 53 | - Chapter 5 - RAG Evaluation : Checking accuracy, relevance and faithfulness [Notebook](./Chapters/Chapter-05/rag_evaluations.ipynb) 54 | 55 | - Chapter 6 - Progression of RAG systems : Naive to Advanced to Modular [Notebook](./Chapters/Chapter-06/advanced_rag.ipynb) 56 | 57 | - Chapter 7 - Evolving RAGOps Stack : Technologies that make RAG possible 58 | 59 | - Chapter 8 - Nuances : Comparison with fine-tuning, multimodal and agentic RAG 60 | 61 | - Chapter 9 - Cutting Edge : Best practices and further exploration 62 | 63 | Why join MEAP? 64 | --- 65 | By joining the Manning Early Access Program, you'll get: 66 | - Immediate access to the book's current draft and all future updates 67 | - A chance to provide feedback and shape the final content 68 | - Exclusive discounts and early-bird offers 69 | 70 | Code 71 | --- 72 | Code Snippets are organised in the Chapters Directory by [Chapters](./Chapters) 73 | 74 | 75 | 76 | - Chapter 1 - Does not have any code 77 | 78 | - Chapter 2 - Does not have any code 79 | 80 | - [Chapter 3 - This notebook](./Chapters/Chapter-03/indexing_pipeline.ipynb) outlines the indexing pipeline. 81 | 82 | A knowledge base is created for the 2023 Cricket World Cup based on the Wikipedia Article on the topic.
We use __AsyncHtmlLoader__ and __Html2TextTransformer__ to load the article, chunk the text using __RecursiveCharacterTextSplitter__, use __text-embedding-3-large__ from OpenAI to convert chunks into vectors and use __FAISS__ as the vector index to store the embeddings. 83 | 84 | - [Chapter 4 - This notebook](./Chapters/Chapter-04/generation_pipeline.ipynb) outlines the generation pipeline. 85 | 86 | We use the knowledge base created in Chapter 03 on the Wikipedia article on 2023 Cricket World Cup. We load the __FAISS__ index and use the __similarity search__ function to retrieve chunks. We then augment the user query with the retrieved chunk and use __GPT 4o__ model from OpenAI to generate the response. 87 | 88 | This notebook also includes functions that can be used to generate answers for different queries that a user may want to ask. 89 | 90 | Additionally, this chapter contains [another notebook](./Chapters/Chapter-04/xtra_tfidf_bm25_retriever.ipynb) that shows the usage of TF-IDF and BM25 as retriever algorithms. 91 | 92 | - [Chapter 5 - This notebook](./Chapters/Chapter-05/rag_evaluations.ipynb) evaluates the RAG pipeline created in chapters 3 and 4 using the __RAGAS__ framework. 93 | 94 | Additionally, this chapter includes a [notebook](./Chapters/Chapter-05/xtra_benchmarking.ipynb) that uses LangChain Benchmarks to benchmark our RAG pipeline on LangChain QnA docs. 95 | 96 | - [Chapter 6 - This notebook](./Chapters/Chapter-06/advanced_rag.ipynb) demonstrates selected Index Optimization, Query Optimization, Retrieval Strategies and Post Retrieval Compression techniques.
97 | 98 | __Note: This is a WIP repository and subsequent chapters will be released on an ongoing basis__ 99 | 100 | Setup 101 | --- 102 | 103 | Clone this repository to your local machine: 104 | 105 | ``` 106 | git clone https://github.com/abhinav-kimothi/A-Simple-Introduction-to-RAG.git 107 | ``` 108 | 109 | Navigate to the cloned repository: 110 | 111 | 112 | cd A-Simple-Introduction-to-RAG 113 | 114 | It's recommended to use a virtual environment to avoid conflicts with other projects or system-wide Python packages. 115 | 116 | Run the following command to create a virtual environment named .myenv (you can name it anything you like): 117 | 118 | ``` 119 | python3 -m venv .myenv 120 | ``` 121 | Activate the Virtual Environment: 122 | 123 | - On Windows, activate the virtual environment by running: 124 | 125 | ``` 126 | .myenv\Scripts\activate.bat 127 | ``` 128 | 129 | - On macOS and Linux, activate it with: 130 | 131 | ``` 132 | source .myenv/bin/activate 133 | ``` 134 | 135 | Install the package requirements from the requirements.txt by executing the following pip installation command: 136 | 137 | 138 | ``` 139 | pip install -r requirements.txt 140 | ``` 141 | 142 | _recommended_: Store your API keys in a .env file: 143 | ``` 144 | OPENAI_API_KEY= 145 | LANGCHAIN_API_KEY= 146 | 147 | ### You can also look at the .\example_dot_env file in this repo for the structure. Remember to rename to .env 148 | ``` 149 | 150 | The notebooks in this repository need __python version > 3.11.1__ 151 | 152 | Feedback & Contribution 153 | --- 154 | I'd love to hear what you think about the code here and the book in general. I appreciate and welcome all feedback.
You can either post your thoughts, questions, critiques and ideas in the [Discussion forum of this repo](https://github.com/abhinav-kimothi/A-Simple-Introduction-to-RAG/discussions) or if you've purchased the [MEAP](https://mng.bz/8wdg) you can also provide your feedback on the [livebook on manning.com](https://livebook.manning.com/book/a-simple-guide-to-retrieval-augmented-generation). 155 | 156 | If you notice any errors or issues with the code, please raise an [issue here in the repo](https://github.com/abhinav-kimothi/A-Simple-Introduction-to-RAG/issues) 157 | 158 | About me 159 | --- 160 | Hi! I'm Abhinav! I am an entrepreneur and Vice President of Artificial Intelligence at Yarnit. I have spent over 15 years in consulting and leadership roles in data science, machine learning and AI. My current focus is in the applied Generative AI domain focussing on solving enterprise needs through contextual intelligence. I'm passionate about AI advancements, constantly exploring emerging technologies to push the boundaries and create positive impacts in the world. Let’s build the future, together!
161 | 162 | #### If you'd like to chat, I'd be very happy to connect 163 | 164 | [![GitHub followers](https://img.shields.io/badge/Github-000000?style=for-the-badge&logo=github&logoColor=black&color=orange)](https://github.com/abhinav-kimothi) 165 | [![LinkedIn](https://img.shields.io/badge/LinkedIn-000000?style=for-the-badge&logo=linkedin&logoColor=orange&color=black)](https://www.linkedin.com/comm/mynetwork/discovery-see-all?usecase=PEOPLE_FOLLOWS&followMember=abhinav-kimothi) 166 | [![Medium](https://img.shields.io/badge/Medium-000000?style=for-the-badge&logo=medium&logoColor=black&color=orange)](https://medium.com/@abhinavkimothi) 167 | [![Insta](https://img.shields.io/badge/Instagram-000000?style=for-the-badge&logo=instagram&logoColor=orange&color=black)](https://www.instagram.com/akaiworks/) 168 | [![Mail](https://img.shields.io/badge/email-000000?style=for-the-badge&logo=gmail&logoColor=black&color=orange)](mailto:abhinav.kimothi.ds@gmail.com) 169 | [![X](https://img.shields.io/badge/Follow-000000?style=for-the-badge&logo=X&logoColor=orange&color=black)](https://twitter.com/abhinav_kimothi) 170 | [![Linktree](https://img.shields.io/badge/Linktree-000000?style=for-the-badge&logo=linktree&logoColor=black&color=orange)](https://linktr.ee/abhinavkimothi) 171 | [![Gumroad](https://img.shields.io/badge/Gumroad-000000?style=for-the-badge&logo=gumroad&logoColor=orange&color=black)](https://abhinavkimothi.gumroad.com/) 172 | 173 | 174 | [If you haven't already, please subscribe to the MEAP of A Simple Guide to Retrieval Augmented Generation here](https://mng.bz/8wdg) 175 | 176 | 177 | New MEAP 178 | 179 | 180 | --- 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | -------------------------------------------------------------------------------- /example_dot_env: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY=sk-proj-****** 2 | LANGCHAIN_API_KEY=lsv2_****** 
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | langchain==0.3.19 2 | langchain_community==0.3.18 3 | bs4==0.0.2 4 | html2text==2024.2.26 5 | matplotlib==3.10.0 6 | lxml==5.3.1 7 | langchain_huggingface==0.1.2 8 | openai==1.64.0 9 | langchain-openai==0.3.7 10 | faiss-cpu==1.10.0 11 | python-dotenv==1.0.1 12 | scikit-learn==1.6.1 13 | rank_bm25==0.2.2 14 | ragas==0.2.13 15 | rapidfuzz==3.12.1 --------------------------------------------------------------------------------