├── .gitattributes ├── .gitignore ├── Assets └── Images │ ├── 3.1.png │ ├── 3.2.png │ ├── 3.3.png │ ├── 3.4.png │ ├── 3.5.png │ ├── 4.+1.png │ ├── 4.+2.png │ ├── 4.1.png │ ├── 4.2.png │ ├── 4.3.png │ ├── 5.1 1.png │ ├── 5.1.png │ ├── 6.1.png │ ├── 6.2.png │ ├── MEAP-HI 2.png │ ├── MEAP-HI.png │ ├── NewMEAP.png │ ├── NewMEAPFooter.png │ ├── NewMEAPHeader.png │ └── profile_s.png ├── Chapters ├── Chapter-03 │ └── indexing_pipeline.ipynb ├── Chapter-04 │ ├── generation_pipeline.ipynb │ └── xtra_tfidf_bm25_retriever.ipynb ├── Chapter-05 │ ├── evaluators.py │ └── rag_evaluations.ipynb ├── Chapter-06 │ └── advanced_rag.ipynb └── Readme.md ├── LICENSE ├── README.md ├── example_dot_env └── requirements.txt /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 105 | __pypackages__/ 106 | 107 | # Celery stuff 108 | celerybeat-schedule 109 | celerybeat.pid 110 | 111 | # SageMath parsed files 112 | *.sage.py 113 | 114 | # Environments 115 | .env 116 | .venv 117 | env/ 118 | venv/ 119 | ENV/ 120 | env.bak/ 121 | venv.bak/ 122 | 123 | # Spyder project settings 124 | .spyderproject 125 | .spyproject 126 | 127 | # Rope project settings 128 | .ropeproject 129 | 130 | # mkdocs documentation 131 | /site 132 | 133 | # mypy 134 | .mypy_cache/ 135 | .dmypy.json 136 | dmypy.json 137 | 138 | # Pyre type checker 139 | .pyre/ 140 | 141 | # pytype static type analyzer 142 | .pytype/ 143 | 144 | # Cython debug symbols 145 | cython_debug/ 146 | 147 | # PyCharm 148 | # JetBrains specific template is maintainted in a separate JetBrains.gitignore that can 149 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 150 | # and can be added to the global gitignore or merged into this file. For a more nuclear 151 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
152 | #.idea/ 153 | 154 | .myenv 155 | .env 156 | requirements 2.txt 157 | temp.ipynb 158 | Chapters/Chapter-05/xtra_benchmarking.ipynb 159 | xtra_benchmarking.ipynb 160 | *.faiss 161 | *.pkl 162 | 163 | -------------------------------------------------------------------------------- /Assets/Images/3.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/3.1.png -------------------------------------------------------------------------------- /Assets/Images/3.2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/3.2.png -------------------------------------------------------------------------------- /Assets/Images/3.3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/3.3.png -------------------------------------------------------------------------------- /Assets/Images/3.4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/3.4.png -------------------------------------------------------------------------------- /Assets/Images/3.5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/3.5.png -------------------------------------------------------------------------------- /Assets/Images/4.+1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/4.+1.png -------------------------------------------------------------------------------- /Assets/Images/4.+2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/4.+2.png -------------------------------------------------------------------------------- /Assets/Images/4.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/4.1.png -------------------------------------------------------------------------------- /Assets/Images/4.2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/4.2.png -------------------------------------------------------------------------------- /Assets/Images/4.3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/4.3.png -------------------------------------------------------------------------------- /Assets/Images/5.1 1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/5.1 1.png -------------------------------------------------------------------------------- /Assets/Images/5.1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/5.1.png -------------------------------------------------------------------------------- /Assets/Images/6.1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/6.1.png -------------------------------------------------------------------------------- /Assets/Images/6.2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/6.2.png -------------------------------------------------------------------------------- /Assets/Images/MEAP-HI 2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/MEAP-HI 2.png -------------------------------------------------------------------------------- /Assets/Images/MEAP-HI.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/MEAP-HI.png -------------------------------------------------------------------------------- /Assets/Images/NewMEAP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/NewMEAP.png -------------------------------------------------------------------------------- /Assets/Images/NewMEAPFooter.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/NewMEAPFooter.png -------------------------------------------------------------------------------- /Assets/Images/NewMEAPHeader.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/NewMEAPHeader.png -------------------------------------------------------------------------------- /Assets/Images/profile_s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abhinav-kimothi/A-Simple-Guide-to-RAG/a827e1592ea45a3b0b7ee8ab25159819ce64b130/Assets/Images/profile_s.png -------------------------------------------------------------------------------- /Chapters/Chapter-04/generation_pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | " \"New\n", 9 | "\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Chapter 04 - Generation Pipeline: Generating Contextual LLM Responses" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "### Welcome to chapter 4 of A Simple Introduction to Retrieval Augmented Generation.\n", 24 | "\n", 25 | "In this chapter, we introduce the concepts behind the real-time generation pipeline that uses the knowledge base created by the indexing pipeline. This will complete the development of a simple RAG system.\n", 26 | "\n", 27 | "The generation pipeline consists of three steps -\n", 28 | "\n", 29 | "1. Retrieval\n", 30 | "2. Augmentation\n", 31 | "3. 
Generation" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## Installing Dependencies" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "All the necessary libraries for running this notebook along with their versions can be found in __requirements.txt__ file in the root directory of this repository\n", 53 | "\n", 54 | "You should go to the root directory and run the following command to install the libraries\n", 55 | "\n", 56 | "```\n", 57 | "pip install -r requirements.txt\n", 58 | "```\n", 59 | "\n", 60 | "This is the recommended method of installing the dependencies\n", 61 | "\n", 62 | "___\n", 63 | "Alternatively, you can run the command from this notebook too. The relative path may vary" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 1, 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "name": "stdout", 73 | "output_type": "stream", 74 | "text": [ 75 | "\n", 76 | "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip available: \u001b[0m\u001b[31;49m22.3.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.2\u001b[0m\n", 77 | "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", 78 | "Note: you may need to restart the kernel to use updated packages.\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "%pip install -r ../../requirements.txt --quiet" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "## 1. Load the Vector Index" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "In Chapter 3, we were working on indexing the Wikipedia page for the 2023 cricket world cup. 
If you recall we had used embeddings from OpenAI to encode the text and used FAISS as the vector index to store the embeddings. We also stored the FAISS index in a local directory. Let’s reuse this index" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "Note: You will need an __OpenAI API Key__ which can be obtained from [OpenAI](https://platform.openai.com/api-keys) to reuse the embeddings." 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "To initialize the __OpenAI client__, we need to pass the api key. There are many ways of doing it. \n", 112 | "\n", 113 | "#### [Option 1] Creating a .env file for storing the API key and using it # Recommended\n", 114 | "\n", 115 | "Install the __dotenv__ library\n", 116 | "\n", 117 | "_The dotenv library is a popular tool used in various programming languages, including Python and Node.js, to manage environment variables in development and deployment environments. It allows developers to load environment variables from a .env file into their application's environment._\n", 118 | "\n", 119 | "- Create a file named .env in the root directory of their project.\n", 120 | "- Inside the .env file, then define environment variables in the format VARIABLE_NAME=value. \n", 121 | "\n", 122 | "e.g.\n", 123 | "\n", 124 | "OPENAI_API_KEY=YOUR API KEY" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 2, 130 | "metadata": {}, 131 | "outputs": [ 132 | { 133 | "name": "stdout", 134 | "output_type": "stream", 135 | "text": [ 136 | "Success: .env file found with some environment variables\n" 137 | ] 138 | } 139 | ], 140 | "source": [ 141 | "from dotenv import load_dotenv\n", 142 | "import os\n", 143 | "\n", 144 | "if load_dotenv():\n", 145 | " print(\"Success: .env file found with some environment variables\")\n", 146 | "else:\n", 147 | " print(\"Caution: No environment variables found. 
Please create .env file in the root directory or add environment variables in the .env file\")" 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "metadata": {}, 153 | "source": [ 154 | "#### [Option 2] Alternatively, you can set the API key in code. \n", 155 | "However, this is not recommended since it can leave your key exposed for potential misuse. Uncomment the cell below to use this method." 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 3, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "#import os\n", 165 | "# os.environ[\"OPENAI_API_KEY\"] = \"sk-proj-******\" #Imp : Replace with an OpenAI API Key" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "We can also test if the key is valid or not" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 3, 178 | "metadata": {}, 179 | "outputs": [ 180 | { 181 | "name": "stdout", 182 | "output_type": "stream", 183 | "text": [ 184 | "OPENAI_API_KEY is set and is valid\n" 185 | ] 186 | } 187 | ], 188 | "source": [ 189 | "api_key=os.environ[\"OPENAI_API_KEY\"]\n", 190 | "\n", 191 | "from openai import OpenAI\n", 192 | "\n", 193 | "client = OpenAI()\n", 194 | "\n", 195 | "\n", 196 | "if api_key:\n", 197 | " try:\n", 198 | " client.models.list()\n", 199 | " print(\"OPENAI_API_KEY is set and is valid\")\n", 200 | " except openai.APIError as e:\n", 201 | " print(f\"OpenAI API returned an API Error: {e}\")\n", 202 | " pass\n", 203 | " except openai.APIConnectionError as e:\n", 204 | " print(f\"Failed to connect to OpenAI API: {e}\")\n", 205 | " pass\n", 206 | " except openai.RateLimitError as e:\n", 207 | " print(f\"OpenAI API request exceeded rate limit: {e}\")\n", 208 | " pass\n", 209 | "\n", 210 | "else:\n", 211 | " print(\"Please set you OpenAI API key as an environment variable OPENAI_API_KEY\")\n", 212 | "\n" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 
5, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "# Import OpenAIEmbeddings from the library\n", 222 | "from langchain_openai import OpenAIEmbeddings\n", 223 | "\n", 224 | "# Instantiate the embeddings object\n", 225 | "embeddings=OpenAIEmbeddings(model=\"text-embedding-3-small\")\n", 226 | "\n", 227 | "# Import FAISS from langchain\n", 228 | "from langchain_community.vectorstores import FAISS\n", 229 | "\n", 230 | "# Load the FAISS vector store with safe deserialization\n", 231 | "vector_store = FAISS.load_local(folder_path=\"../../Assets/Data/\",index_name=\"CWC_index\", embeddings=embeddings, allow_dangerous_deserialization=True)\n" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "## 2. Retrieval" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "We will now retrieve a relevant passage from the knowledge base that is pertinent to our query - __\"Who won the World Cup final?\"__" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 6, 258 | "metadata": {}, 259 | "outputs": [ 260 | { 261 | "name": "stdout", 262 | "output_type": "stream", 263 | "text": [ 264 | " Retrieved Chunk 1: The tournament was contested by ten national teams, maintaining the same format\n", 265 | "used in 2019 . After six weeks of round-robin matches, India , South Africa , Australia , and\n", 266 | "New Zealand finished as the top four and qualified for the knockout stage. In the knockout stage,\n", 267 | "India and Australia beat New Zealand and South Africa, respectively, to advance to the final, played\n", 268 | "on 19 November at the Narendra Modi Stadium in Ahmedabad . 
Australia won the final by six\n", 269 | "wickets, winning their sixth Cricket World Cup title.\n", 270 | "\n", 271 | "\n", 272 | "\n", 273 | " Retrieved Chunk 2: The host India was the first team to qualify for the semi-finals after their\n", 274 | "302-run win against Sri Lanka , their seventh successive win in the World Cup. [ 42 ] India\n", 275 | "secured the top place amongst the semi-finalists after they beat South Africa by 243 runs on 5\n", 276 | "November at Eden Gardens in Kolkata . [ 43 ]\n", 277 | "\n", 278 | "\n", 279 | "\n" 280 | ] 281 | } 282 | ], 283 | "source": [ 284 | "# Define the query\n", 285 | "query = \"Who won the world cup?\"\n", 286 | "\n", 287 | "# Perform similarity search\n", 288 | "retrieved_docs = vector_store.similarity_search(query, k=2) # Get top 2 relevant chunks\n", 289 | "\n", 290 | "# Display results\n", 291 | "\n", 292 | "import textwrap\n", 293 | "\n", 294 | "for i, doc in enumerate(retrieved_docs):\n", 295 | " print(textwrap.fill(f\"\\nRetrieved Chunk {i+1}:\\n{doc.page_content}\",width=100))\n", 296 | " print(\"\\n\\n\")" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "This is the most basic implementation of a retriever in the generation pipeline of a RAG-enabled system. This method of retrieval is enabled by embeddings. We used the text-embedding-3-small from OpenAI. FAISS calculated the similarity score based on these embeddings" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "## 3. Augmentation" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "The information fetched by the retriever should also be sent to the LLM in form of a natural language prompt. This process of combining the user query and the retrieved information is called augmentation." 
318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "We will now execute augmentation with a simple contextual prompt with controlled generation." 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 7, 337 | "metadata": {}, 338 | "outputs": [ 339 | { 340 | "name": "stdout", 341 | "output_type": "stream", 342 | "text": [ 343 | " Given the context below answer the question. Question: Who won the world cup? Context : The tournament was contested by ten national teams,\n", 344 | "maintaining the same format used in 2019 . After six weeks of round-robin matches, India , South Africa , Australia , and New Zealand finished\n", 345 | "as the top four and qualified for the knockout stage. In the knockout stage, India and Australia beat New Zealand and South Africa, respectively, to\n", 346 | "advance to the final, played on 19 November at the Narendra Modi Stadium in Ahmedabad . Australia won the final by six wickets, winning their sixth\n", 347 | "Cricket World Cup title.The host India was the first team to qualify for the semi-finals after their 302-run win against Sri Lanka , their seventh\n", 348 | "successive win in the World Cup. [ 42 ] India secured the top place amongst the semi-finalists after they beat South Africa by 243 runs on 5\n", 349 | "November at Eden Gardens in Kolkata . [ 43 ] Remember to answer only based on the context provided and not from any other source. 
If the\n", 350 | "question cannot be answered based on the provided context, say I don’t know.\n" 351 | ] 352 | } 353 | ], 354 | "source": [ 355 | "# taking first two retrieved documents\n", 356 | "retrieved_context=retrieved_docs[0].page_content + retrieved_docs[1].page_content\n", 357 | "\n", 358 | "# Creating the prompt\n", 359 | "augmented_prompt=f\"\"\"\n", 360 | "\n", 361 | "Given the context below answer the question.\n", 362 | "\n", 363 | "Question: {query} \n", 364 | "\n", 365 | "Context : {retrieved_context}\n", 366 | "\n", 367 | "Remember to answer only based on the context provided and not from any other source. \n", 368 | "\n", 369 | "If the question cannot be answered based on the provided context, say I don’t know.\n", 370 | "\n", 371 | "\"\"\"\n", 372 | "\n", 373 | "print(textwrap.fill(augmented_prompt,width=150))" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "## 4. Generation" 381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "metadata": {}, 386 | "source": [ 387 | "Generation is the final step of this pipeline. While LLMs may be used in any of the previous steps in the pipeline, the generation step is completely reliant on the LLM. The most popular LLMs are the ones being developed by OpenAI, Anthropic, Meta, Google, Microsoft and Mistral amongst other developers. " 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "We have built a simple retriever using FAISS and OpenAI embeddings and, we created a simple augmented prompt. Now we will use OpenAI’s latest model, GPT-4o-mini, to generate the response. 
To do this we will import the __ChatOpenAI__ library from langchain" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "metadata": {}, 401 | "outputs": [ 402 | { 403 | "name": "stdout", 404 | "output_type": "stream", 405 | "text": [ 406 | "Australia won the world cup.\n" 407 | ] 408 | } 409 | ], 410 | "source": [ 411 | "from langchain_openai import ChatOpenAI\n", 412 | "\n", 413 | "\n", 414 | "# Set up LLM \n", 415 | "llm = ChatOpenAI(\n", 416 | " model=\"gpt-4o-mini\",\n", 417 | " temperature=0,\n", 418 | " max_tokens=None,\n", 419 | " timeout=None,\n", 420 | " max_retries=2\n", 421 | ")\n", 422 | "\n", 423 | "messages=[(\"human\",augmented_prompt)]\n", 424 | "\n", 425 | "ai_msg = llm.invoke(messages)\n", 426 | "\n", 427 | "\n", 428 | "\n", 429 | "# Extract the answer from the response object\n", 430 | "answer=ai_msg.content\n", 431 | "\n", 432 | "print(answer)\n" 433 | ] 434 | }, 435 | { 436 | "cell_type": "markdown", 437 | "metadata": {}, 438 | "source": [ 439 | "# 5. RAG function" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": {}, 445 | "source": [ 446 | "Let us build a function that will take three inputs - \n", 447 | "1. User Query\n", 448 | "2. Location of the Vector Index (Knowledge base)\n", 449 | "3. 
Index Name\n", 450 | "\n", 451 | "And generate an answer along with the retrieved documents" 452 | ] 453 | }, 454 | { 455 | "cell_type": "markdown", 456 | "metadata": {}, 457 | "source": [ 458 | "#### RAG function" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": 9, 464 | "metadata": {}, 465 | "outputs": [], 466 | "source": [ 467 | "import re\n", 468 | "\n", 469 | "# Function to clean text\n", 470 | "def clean_text(text):\n", 471 | " # Replace non-breaking space with regular space\n", 472 | " text = text.replace('\\xa0', ' ')\n", 473 | " \n", 474 | " # Remove any HTML tags (if any)\n", 475 | " text = re.sub(r'<[^>]+>', '', text) # Removes HTML tags\n", 476 | " \n", 477 | " # Remove references in brackets (e.g., [7], [39])\n", 478 | " text = re.sub(r'\\[.*?\\]', '', text) # Removes references inside square brackets\n", 479 | " \n", 480 | " # Remove extra spaces and newlines\n", 481 | " text = ' '.join(text.split()) # This will remove extra spaces and newline characters\n", 482 | " \n", 483 | " return text\n", 484 | "\n", 485 | "def rag_function(query, db_path, index_name):\n", 486 | " embeddings=OpenAIEmbeddings(model=\"text-embedding-3-small\")\n", 487 | "\n", 488 | " db=FAISS.load_local(folder_path=db_path, index_name=index_name, embeddings=embeddings, allow_dangerous_deserialization=True)\n", 489 | "\n", 490 | " retrieved_docs = db.similarity_search(query, k=2)\n", 491 | "\n", 492 | " retrieved_context=[clean_text(retrieved_docs[0].page_content + retrieved_docs[1].page_content)]\n", 493 | "\n", 494 | "\n", 495 | " augmented_prompt=f\"\"\"\n", 496 | "\n", 497 | " Given the context below answer the question.\n", 498 | "\n", 499 | " Question: {query} \n", 500 | "\n", 501 | " Context : {retrieved_context}\n", 502 | "\n", 503 | " Remember to answer only based on the context provided and not from any other source. 
\n", 504 | "\n", 505 | " If the question cannot be answered based on the provided context, say I don’t know.\n", 506 | "\n", 507 | " \"\"\"\n", 508 | "\n", 509 | " llm = ChatOpenAI(\n", 510 | " model=\"gpt-4o-mini\",\n", 511 | " temperature=0,\n", 512 | " max_tokens=None,\n", 513 | " timeout=None,\n", 514 | " max_retries=2\n", 515 | " )\n", 516 | "\n", 517 | " messages=[(\"human\",augmented_prompt)]\n", 518 | "\n", 519 | " ai_msg = llm.invoke(messages)\n", 520 | "\n", 521 | " response=ai_msg.content\n", 522 | "\n", 523 | " return retrieved_context, response\n", 524 | "\n" 525 | ] 526 | }, 527 | { 528 | "cell_type": "markdown", 529 | "metadata": {}, 530 | "source": [ 531 | "Let's try sending our question to this function." 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": 10, 537 | "metadata": {}, 538 | "outputs": [ 539 | { 540 | "data": { 541 | "text/plain": [ 542 | "(['The tournament was contested by ten national teams, maintaining the same format used in 2019 . After six weeks of round-robin matches, India , South Africa , Australia , and New Zealand finished as the top four and qualified for the knockout stage. In the knockout stage, India and Australia beat New Zealand and South Africa, respectively, to advance to the final, played on 19 November at the Narendra Modi Stadium in Ahmedabad . Australia won the final by six wickets, winning their sixth Cricket World Cup title.The host India was the first team to qualify for the semi-finals after their 302-run win against Sri Lanka , their seventh successive win in the World Cup. 
India secured the top place amongst the semi-finalists after they beat South Africa by 243 runs on 5 November at Eden Gardens in Kolkata .'],\n", 543 | " 'Australia won the world cup.')" 544 | ] 545 | }, 546 | "execution_count": 10, 547 | "metadata": {}, 548 | "output_type": "execute_result" 549 | } 550 | ], 551 | "source": [ 552 | "rag_function(query=\"Who won the world cup?\", db_path=\"../../Assets/Data\", index_name=\"CWC_index\")" 553 | ] 554 | }, 555 | { 556 | "cell_type": "markdown", 557 | "metadata": {}, 558 | "source": [ 559 | "Let's ask another one." 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": 11, 565 | "metadata": {}, 566 | "outputs": [ 567 | { 568 | "data": { 569 | "text/plain": [ 570 | "(['Virat Kohli was named the player of the tournament and also scored the most runs, while Mohammed Shami was the leading wicket-taker. A total of 1,250,307 spectators attended the matches, the highest number in any Cricket World Cup to date. The tournament final set viewership records in India, drawing 518 million viewers, with a peak of 57 million streaming viewers.The ICC announced its team of the tournament on 21 November 2023, with Virat Kohli being named as player of the tournament , and Rohit Sharma as captain of the team.'],\n", 571 | " 'Virat Kohli was named the player of the tournament and scored the most runs.')" 572 | ] 573 | }, 574 | "execution_count": 11, 575 | "metadata": {}, 576 | "output_type": "execute_result" 577 | } 578 | ], 579 | "source": [ 580 | "rag_function(\"What was Virat Kohli's achievement in the Cup?\",db_path=\"../../Assets/Data\", index_name=\"CWC_index\")" 581 | ] 582 | }, 583 | { 584 | "cell_type": "markdown", 585 | "metadata": {}, 586 | "source": [ 587 | "We can also ask a list of questions and see what the responses are" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": 13, 593 | "metadata": {}, 594 | "outputs": [], 595 | "source": [ 596 | "list_of_queries=['What was the 
outcome of the match between Australia and the Netherlands on 25 October 2023?',\n", 597 | " 'What ongoing cricket competition is currently taking place that involves multiple international teams?',\n", 598 | " 'What was the deadline for teams to finalize their 15-player squads for the 2023 Cricket World Cup?',\n", 599 | " \"What were the key highlights of the 2023 ICC Men's Cricket World Cup?\",\n", 600 | " 'What were the key outcomes of the 2023 Cricket World Cup, including the final match results and notable player statistics?',\n", 601 | " 'What years had Cricket World Cup finals and their host nations?',\n", 602 | " \"Which org has managed the Cricket World Cup since '75?\",\n", 603 | " \"What was India's winning margin vs. S. Africa on Nov 5, 2023?\",\n", 604 | " 'What teams qualified for the semi-finals in the 2023 Cricket World Cup?']" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": 15, 610 | "metadata": {}, 611 | "outputs": [ 612 | { 613 | "name": "stdout", 614 | "output_type": "stream", 615 | "text": [ 616 | "Query:What was the outcome of the match between Australia and the Netherlands on 25 October 2023?\n", 617 | "Response: I don’t know.\n", 618 | "\n", 619 | "Query:What ongoing cricket competition is currently taking place that involves multiple international teams?\n", 620 | "Response: I don’t know.\n", 621 | "\n", 622 | "Query:What was the deadline for teams to finalize their 15-player squads for the 2023 Cricket World Cup?\n", 623 | "Response: The deadline for teams to finalize their 15-player squads for the 2023 Cricket World Cup was 28 September.\n", 624 | "\n", 625 | "Query:What were the key highlights of the 2023 ICC Men's Cricket World Cup?\n", 626 | "Response: The key highlights of the 2023 ICC Men's Cricket World Cup include:\n", 627 | "\n", 628 | "- Dates: 5 October – 19 November 2023\n", 629 | "- Host: India (first time as the sole host)\n", 630 | "- Format: One Day International (ODI) with a round-robin and 
knockout tournament structure\n", 631 | "- Participants: 10 teams\n", 632 | "- Matches: 48 played\n", 633 | "- Attendance: 1,250,307 (average of 26,048 per match)\n", 634 | "- Champions: Australia (6th title)\n", 635 | "- Runners-up: India\n", 636 | "- Player of the Series: Virat Kohli\n", 637 | "- Most Runs: Virat Kohli (765 runs)\n", 638 | "- Most Wickets: Mohammed Shami (24 wickets)\n", 639 | "\n", 640 | "Query:What were the key outcomes of the 2023 Cricket World Cup, including the final match results and notable player statistics?\n", 641 | "Response: I don’t know.\n", 642 | "\n", 643 | "Query:What years had Cricket World Cup finals and their host nations?\n", 644 | "Response: The years that had Cricket World Cup finals and their host nations are as follows:\n", 645 | "\n", 646 | "- 1975: England\n", 647 | "- 1979: England\n", 648 | "- 1983: England / Wales\n", 649 | "- 1987: Australia / New Zealand\n", 650 | "- 1992: Pakistan / India / Sri Lanka\n", 651 | "- 1996: England / Scotland / Wales / Ireland / Netherlands\n", 652 | "- 1999: South Africa / Zimbabwe / Kenya\n", 653 | "- 2003: West Indies\n", 654 | "- 2007: India / Sri Lanka / Bangladesh\n", 655 | "- 2011: Australia / New Zealand\n", 656 | "- 2015: England / Wales\n", 657 | "- 2019: India\n", 658 | "- 2023: South Africa / Zimbabwe / Namibia\n", 659 | "\n", 660 | "Query:Which org has managed the Cricket World Cup since '75?\n", 661 | "Response: The organization that has managed the Cricket World Cup since 1975 is the International Cricket Council (ICC).\n", 662 | "\n", 663 | "Query:What was India's winning margin vs. S. 
Africa on Nov 5, 2023?\n", 664 | "Response: I don’t know.\n", 665 | "\n", 666 | "Query:What teams qualified for the semi-finals in the 2023 Cricket World Cup?\n", 667 | "Response: I don’t know.\n", 668 | "\n" 669 | ] 670 | } 671 | ], 672 | "source": [ 673 | "for query in list_of_queries:\n", 674 | " print(f\"Query:{query}\")\n", 675 | " print(f\"Response: {rag_function(query,db_path=\"../../Assets/Data\", index_name=\"CWC_index\")[1]}\\n\")\n" 676 | ] 677 | }, 678 | { 679 | "cell_type": "markdown", 680 | "metadata": {}, 681 | "source": [ 682 | "For some of the questions above, the response may be \"I don't know\". That is when the LLM can't find an answer in the retrieved context. In our augmentation step, we had asked the LLM to do so." 683 | ] 684 | }, 685 | { 686 | "cell_type": "markdown", 687 | "metadata": {}, 688 | "source": [ 689 | "---" 690 | ] 691 | }, 692 | { 693 | "cell_type": "markdown", 694 | "metadata": {}, 695 | "source": [ 696 | "Is the RAG system that we have created generating the responses on the expected lines? Is the LLM still hallucinating? Before trying to improve the performance of the system we need to be able to measure and benchmark it. That is what we will do in chapter 5. We will look at the evaluation metrics and the popular benchmarks for RAG." 697 | ] 698 | }, 699 | { 700 | "cell_type": "markdown", 701 | "metadata": {}, 702 | "source": [ 703 | "---" 704 | ] 705 | }, 706 | { 707 | "cell_type": "markdown", 708 | "metadata": {}, 709 | "source": [ 710 | " \n", 711 | "\n", 712 | "Hi! I'm Abhinav! I am an entrepreneur and Vice President of Artificial Intelligence at Yarnit. I have spent over 15 years consulting and leadership roles in data science, machine learning and AI. My current focus is in the applied Generative AI domain focussing on solving enterprise needs through contextual intelligence. I'm passionate about AI advancements constantly exploring emerging technologies to push the boundaries and create positive impacts in the world. 
Let’s build the future, together!\n", 713 | "\n", 714 | "[If you haven't already, please subscribe to the MEAP of A Simple Guide to Retrieval Augmented Generation here](https://mng.bz/8wdg)\n", 715 | "\n", 716 | "\n", 717 | " \"New\n", 718 | "\n", 719 | "\n", 720 | "#### If you'd like to chat, I'd be very happy to connect\n", 721 | "\n", 722 | "[![GitHub followers](https://img.shields.io/badge/Github-000000?style=for-the-badge&logo=github&logoColor=black&color=orange)](https://github.com/abhinav-kimothi)\n", 723 | "[![LinkedIn](https://img.shields.io/badge/LinkedIn-000000?style=for-the-badge&logo=linkedin&logoColor=orange&color=black)](https://www.linkedin.com/comm/mynetwork/discovery-see-all?usecase=PEOPLE_FOLLOWS&followMember=abhinav-kimothi)\n", 724 | "[![Medium](https://img.shields.io/badge/Medium-000000?style=for-the-badge&logo=medium&logoColor=black&color=orange)](https://medium.com/@abhinavkimothi)\n", 725 | "[![Insta](https://img.shields.io/badge/Instagram-000000?style=for-the-badge&logo=instagram&logoColor=orange&color=black)](https://www.instagram.com/akaiworks/)\n", 726 | "[![Mail](https://img.shields.io/badge/email-000000?style=for-the-badge&logo=gmail&logoColor=black&color=orange)](mailto:abhinav.kimothi.ds@gmail.com)\n", 727 | "[![X](https://img.shields.io/badge/Follow-000000?style=for-the-badge&logo=X&logoColor=orange&color=black)](https://twitter.com/abhinav_kimothi)\n", 728 | "[![Linktree](https://img.shields.io/badge/Linktree-000000?style=for-the-badge&logo=linktree&logoColor=black&color=orange)](https://linktr.ee/abhinavkimothi)\n", 729 | "[![Gumroad](https://img.shields.io/badge/Gumroad-000000?style=for-the-badge&logo=gumroad&logoColor=orange&color=black)](https://abhinavkimothi.gumroad.com/)\n", 730 | "\n", 731 | "---" 732 | ] 733 | } 734 | ], 735 | "metadata": { 736 | "kernelspec": { 737 | "display_name": ".sgragch4", 738 | "language": "python", 739 | "name": "python3" 740 | }, 741 | "language_info": { 742 | "codemirror_mode": { 743 | "name": 
"ipython", 744 | "version": 3 745 | }, 746 | "file_extension": ".py", 747 | "mimetype": "text/x-python", 748 | "name": "python", 749 | "nbconvert_exporter": "python", 750 | "pygments_lexer": "ipython3", 751 | "version": "3.13.2" 752 | } 753 | }, 754 | "nbformat": 4, 755 | "nbformat_minor": 2 756 | } 757 | -------------------------------------------------------------------------------- /Chapters/Chapter-04/xtra_tfidf_bm25_retriever.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | " \"New\n", 9 | "\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Chapter 04 [Additional] - TFIDF & BM25 Retrievers" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "## Installing Dependencies\n", 24 | "\n", 25 | "All the necessary libraries for running this notebook along with their versions can be found in __requirements.txt__ file in the root directory of this repository\n", 26 | "\n", 27 | "You should go to the root directory and run the following command to install the libraries\n", 28 | "\n", 29 | "```\n", 30 | "pip install -r requirements.txt\n", 31 | "```\n", 32 | "\n", 33 | "This is the recommended method of installing the dependencies\n", 34 | "\n", 35 | "___\n", 36 | "Alternatively, you can run the command from this notebook too. 
The relative path may vary" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 1, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | "Note: you may need to restart the kernel to use updated packages.\n" 49 | ] 50 | } 51 | ], 52 | "source": [ 53 | "%pip install -r ../../requirements.txt --quiet" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 4, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "name": "stdout", 63 | "output_type": "stream", 64 | "text": [ 65 | "Note: you may need to restart the kernel to use updated packages.\n", 66 | "Note: you may need to restart the kernel to use updated packages.\n", 67 | "Note: you may need to restart the kernel to use updated packages.\n", 68 | "Note: you may need to restart the kernel to use updated packages.\n", 69 | "Note: you may need to restart the kernel to use updated packages.\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "%pip install --upgrade pip --quiet\n", 75 | "%pip install langchain==0.2.11 --quiet\n", 76 | "%pip install langchain-community==0.2.10 --quiet\n", 77 | "%pip install scikit-learn==1.4.2 --quiet\n", 78 | "%pip install rank_bm25==0.2.2 --quiet" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "## TF-IDF" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "TF-IDF is a statistical measure used to evaluate the importance of a word in a document relative to a collection of documents (corpus). 
It assigns higher weights to words that appear frequently in a document but infrequently across the corpus" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 3, 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "name": "stdout", 109 | "output_type": "stream", 110 | "text": [ 111 | "Australia won the sixth time having last won in 2015\n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "# Import TFIDFRetriever class from retrievers library\n", 117 | "from langchain_community.retrievers import TFIDFRetriever\n", 118 | "\n", 119 | "# Create instance of the TFIDFRetriever with texts\n", 120 | "retriever = TFIDFRetriever.from_texts(\n", 121 | "[\"Australia won the Cricket World Cup 2023\",\n", 122 | " \"India and Australia played in the finals\",\n", 123 | " \"Australia won the sixth time having last won in 2015\"]\n", 124 | ")\n", 125 | "\n", 126 | "# Use the retriever using the invoke method\n", 127 | "result=retriever.invoke(\"won\")\n", 128 | "\n", 129 | "# Print the results\n", 130 | "print(result[0].page_content)\n" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "## BM25 " 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "BM25 is an advanced probabilistic model used to rank documents based on the query terms appearing in each document. It is part of the family of probabilistic information retrieval models and is considered an advancement over the classic TF-IDF model. The improvement that BM25 brings is that it adjusts for the length of the documents so that longer documents do not unfairly get higher scores. 
" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 5, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "name": "stdout", 161 | "output_type": "stream", 162 | "text": [ 163 | "Australia won the Cricket World Cup 2023\n" 164 | ] 165 | } 166 | ], 167 | "source": [ 168 | "# Import BM25Retriever class from retrievers library\n", 169 | "from langchain_community.retrievers import BM25Retriever\n", 170 | "\n", 171 | "# Create instance of the TFIDFRetriever with texts\n", 172 | "retriever = BM25Retriever.from_texts(\n", 173 | "[\"Australia won the Cricket World Cup 2023\",\n", 174 | " \"India and Australia played in the finals\",\n", 175 | " \"Australia won the sixth time having last won in 2015\"]\n", 176 | ")\n", 177 | "\n", 178 | "# Use the retriever using the invoke method\n", 179 | "result=retriever.invoke(\"Who won the 2023 Cricket World Cup?\")\n", 180 | "\n", 181 | "# Print the results\n", 182 | "print(result[0].page_content)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "---" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | " \n", 197 | "\n", 198 | "Hi! I'm Abhinav! I am an entrepreneur and Vice President of Artificial Intelligence at Yarnit. I have spent over 15 years consulting and leadership roles in data science, machine learning and AI. My current focus is in the applied Generative AI domain focussing on solving enterprise needs through contextual intelligence. I'm passionate about AI advancements constantly exploring emerging technologies to push the boundaries and create positive impacts in the world. 
Let’s build the future, together!\n", 199 | "\n", 200 | "[If you haven't already, please subscribe to the MEAP of A Simple Guide to Retrieval Augmented Generation here](https://mng.bz/8wdg)\n", 201 | "\n", 202 | "\n", 203 | " \"New\n", 204 | "\n", 205 | "\n", 206 | "#### If you'd like to chat, I'd be very happy to connect\n", 207 | "\n", 208 | "[![GitHub followers](https://img.shields.io/badge/Github-000000?style=for-the-badge&logo=github&logoColor=black&color=orange)](https://github.com/abhinav-kimothi)\n", 209 | "[![LinkedIn](https://img.shields.io/badge/LinkedIn-000000?style=for-the-badge&logo=linkedin&logoColor=orange&color=black)](https://www.linkedin.com/comm/mynetwork/discovery-see-all?usecase=PEOPLE_FOLLOWS&followMember=abhinav-kimothi)\n", 210 | "[![Medium](https://img.shields.io/badge/Medium-000000?style=for-the-badge&logo=medium&logoColor=black&color=orange)](https://medium.com/@abhinavkimothi)\n", 211 | "[![Insta](https://img.shields.io/badge/Instagram-000000?style=for-the-badge&logo=instagram&logoColor=orange&color=black)](https://www.instagram.com/akaiworks/)\n", 212 | "[![Mail](https://img.shields.io/badge/email-000000?style=for-the-badge&logo=gmail&logoColor=black&color=orange)](mailto:abhinav.kimothi.ds@gmail.com)\n", 213 | "[![X](https://img.shields.io/badge/Follow-000000?style=for-the-badge&logo=X&logoColor=orange&color=black)](https://twitter.com/abhinav_kimothi)\n", 214 | "[![Linktree](https://img.shields.io/badge/Linktree-000000?style=for-the-badge&logo=linktree&logoColor=black&color=orange)](https://linktr.ee/abhinavkimothi)\n", 215 | "[![Gumroad](https://img.shields.io/badge/Gumroad-000000?style=for-the-badge&logo=gumroad&logoColor=orange&color=black)](https://abhinavkimothi.gumroad.com/)\n", 216 | "\n", 217 | "---" 218 | ] 219 | } 220 | ], 221 | "metadata": { 222 | "kernelspec": { 223 | "display_name": ".envch4ex", 224 | "language": "python", 225 | "name": "python3" 226 | }, 227 | "language_info": { 228 | "codemirror_mode": { 229 | "name": 
"ipython", 230 | "version": 3 231 | }, 232 | "file_extension": ".py", 233 | "mimetype": "text/x-python", 234 | "name": "python", 235 | "nbconvert_exporter": "python", 236 | "pygments_lexer": "ipython3", 237 | "version": "3.13.2" 238 | } 239 | }, 240 | "nbformat": 4, 241 | "nbformat_minor": 2 242 | } 243 | -------------------------------------------------------------------------------- /Chapters/Chapter-05/evaluators.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from langchain.evaluation import load_evaluator 4 | from langchain.smith import RunEvalConfig 5 | from langchain_openai import ChatOpenAI 6 | 7 | try: 8 | from langchain.schema.language_model import BaseLanguageModel 9 | except ImportError: 10 | from langchain_core.language_models import BaseLanguageModel 11 | from langsmith.evaluation.evaluator import EvaluationResult, RunEvaluator 12 | from langsmith.schemas import Example, Run 13 | 14 | 15 | # TODO: Split this into an assertion-by-assertion evaluator 16 | # TODO: Combine with a document relevance evaluator (to report retriever performance) 17 | class FaithfulnessEvaluator(RunEvaluator): 18 | def __init__(self, llm: Optional[BaseLanguageModel] = None): 19 | self.evaluator = load_evaluator( 20 | "labeled_score_string", 21 | criteria={ 22 | "faithfulness": """ 23 | Score 1: The answer directly contradicts the information provided in the reference docs. 24 | Score 3: The answer contains a mix of correct information from the reference docs and incorrect or unverifiable information not found in the docs. 25 | Score 5: The answer is mostly aligned with the reference docs but includes extra information that, while not contradictory, is not verified by the docs. 26 | Score 7: The answer aligns well with the reference docs but includes minor, commonly accepted facts not found in the docs. 
27 | Score 10: The answer perfectly aligns with and is fully entailed by the reference docs, with no extra information.""" 28 | }, 29 | llm=llm, 30 | normalize_by=10, 31 | ) 32 | 33 | @staticmethod 34 | def _get_retrieved_docs(run: Run) -> str: 35 | # This assumes there is only one retriever in your chain. 36 | # To select more precisely, name your retrieval chain 37 | # using with_config(name="my_unique_name") and look up 38 | # by run.name 39 | runs = [run] 40 | while runs: 41 | run = runs.pop() 42 | if run.run_type == "retriever": 43 | return str(run.outputs["documents"]) 44 | if run.child_runs: 45 | runs.extend(run.child_runs[::-1]) 46 | return "" 47 | 48 | def evaluate_run( 49 | self, run: Run, example: Optional[Example] = None 50 | ) -> EvaluationResult: 51 | try: 52 | docs_string = self._get_retrieved_docs(run) 53 | docs_string = f"Reference docs:\n\n{docs_string}\n\n\n" 54 | print(f"\n{docs_string[10]}\n") 55 | input_query = run.inputs["Question"] 56 | print(f"\nInput Query={input_query}\n") 57 | if run.outputs is not None and len(run.outputs) == 1: 58 | prediction = next(iter(run.outputs.values())) 59 | print(f"\nPrediction={prediction}\n") 60 | else: 61 | prediction = run.outputs["output"] 62 | print(f"\nPrediction={prediction}\n") 63 | result = self.evaluator.evaluate_strings( 64 | input=input_query, 65 | prediction=prediction, 66 | reference=docs_string, 67 | ) 68 | return EvaluationResult( 69 | **{"key": "faithfulness", "comment": result.get("reasoning"), **result} 70 | ) 71 | except Exception as e: 72 | return EvaluationResult(key="faithfulness", score=None, comment=repr(e)) 73 | 74 | 75 | _ACCURACY_CRITERION = { 76 | "accuracy": """ 77 | Score 1: The answer is incorrect and unrelated to the question or reference document. 78 | Score 3: The answer shows slight relevance to the question or reference document but is largely incorrect. 79 | Score 5: The answer is partially correct but has significant errors or omissions. 
80 | Score 7: The answer is mostly correct with minor errors or omissions, and aligns with the reference document. 81 | Score 10: The answer is correct, complete, and perfectly aligns with the reference document. 82 | 83 | If the reference answer contains multiple alternatives, the predicted answer must only match one of the alternatives to be considered correct. 84 | If the predicted answer contains additional helpful and accurate information that is not present in the reference answer, it should still be considered correct. 85 | """ # noqa 86 | } 87 | 88 | 89 | def get_eval_config() -> RunEvalConfig: 90 | """Build the RunEvalConfig used to score runs: a labeled accuracy score, an embedding-distance metric, and the custom FaithfulnessEvaluator.""" 91 | eval_llm = ChatOpenAI( 92 | model="gpt-4o-mini", 93 | temperature=0.0, 94 | seed=42, 95 | max_retries=1, 96 | request_timeout=60, 97 | ) 98 | # LLM for the faithfulness evaluator; currently identical settings to eval_llm — swap in a longer-context model here if the retrieved docs grow large 99 | faithfulness_eval_llm = ChatOpenAI( 100 | model="gpt-4o-mini", 101 | temperature=0.0, 102 | seed=42, 103 | max_retries=1, 104 | request_timeout=60, 105 | ) 106 | 107 | return RunEvalConfig( 108 | evaluators=[ 109 | RunEvalConfig.LabeledScoreString( 110 | criteria=_ACCURACY_CRITERION, llm=eval_llm, normalize_by=10.0 111 | ), 112 | RunEvalConfig.EmbeddingDistance(), 113 | ], 114 | custom_evaluators=[FaithfulnessEvaluator(llm=faithfulness_eval_llm)], 115 | ) 116 | -------------------------------------------------------------------------------- /Chapters/Chapter-05/rag_evaluations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | " \"New\n", 9 | "\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Chapter 05 - RAG Evaluation: Accuracy, Relevance, Faithfulness" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "### Welcome to chapter 5 of A Simple Introduction to Retrieval Augmented 
Generation.\n", 24 | "\n", 25 | "In this chapter, we will assess the quality of the RAG pipeline we have built in Chapter 3 & 4. We will re-use the [knowledge base](../../Assets/Data/) we created with the Wikipedia article. We will reuse the Retrieval Augmentation and Generation functions we built in Chapter 4." 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## Installing Dependencies" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "All the necessary libraries for running this notebook along with their versions can be found in __requirements.txt__ file in the root directory of this repository\n", 40 | "\n", 41 | "You should go to the root directory and run the following command to install the libraries\n", 42 | "\n", 43 | "```\n", 44 | "pip install -r requirements.txt\n", 45 | "```\n", 46 | "\n", 47 | "This is the recommended method of installing the dependencies\n", 48 | "\n", 49 | "___\n", 50 | "Alternatively, you can run the command from this notebook too. The relative path may vary" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 1, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "\n", 63 | "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n", 64 | "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", 65 | "Note: you may need to restart the kernel to use updated packages.\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "%pip install -r ../../requirements.txt --quiet" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": {}, 76 | "source": [ 77 | "## 1. 
Re-Load the RAG Pipeline" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "In chapter 4, we created the generation pipeline. We will bring that here to use it for evaluations.\n", 85 | "\n", 86 | "In Chapter 3, we were working on indexing the Wikipedia page for the 2023 cricket world cup. If you recall we had used embeddings from OpenAI to encode the text and used FAISS as the vector index to store the embeddings. We also stored the FAISS index in a local directory. We will use this in the RAG pipeline." 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "Note: You will need an __OpenAI API Key__ which can be obtained from [OpenAI](https://platform.openai.com/api-keys) to reuse the embeddings." 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "To initialize the __OpenAI client__, we need to pass the api key. There are many ways of doing it. \n", 101 | "\n", 102 | "#### [Option 1] Creating a .env file for storing the API key and using it # Recommended\n", 103 | "\n", 104 | "Install the __dotenv__ library\n", 105 | "\n", 106 | "_The dotenv library is a popular tool used in various programming languages, including Python and Node.js, to manage environment variables in development and deployment environments. It allows developers to load environment variables from a .env file into their application's environment._\n", 107 | "\n", 108 | "- Create a file named .env in the root directory of their project.\n", 109 | "- Inside the .env file, then define environment variables in the format VARIABLE_NAME=value. 
\n", 110 | "\n", 111 | "e.g.\n", 112 | "\n", 113 | "OPENAI_API_KEY=YOUR API KEY" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 2, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "Success: .env file found with some environment variables\n" 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "from dotenv import load_dotenv\n", 131 | "import os\n", 132 | "\n", 133 | "if load_dotenv():\n", 134 | " print(\"Success: .env file found with some environment variables\")\n", 135 | "else:\n", 136 | " print(\"Caution: No environment variables found. Please create .env file in the root directory or add environment variables in the .env file\")" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "#### [Option 2] Alternatively, you can set the API key in code. \n", 144 | "However, this is not recommended since it can leave your key exposed for potential misuse. Uncomment the cell below to use this method." 
145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 3, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "#import os\n", 154 | "# os.environ[\"OPENAI_API_KEY\"] = \"sk-proj-******\" #Imp : Replace with an OpenAI API Key" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "We can also test if the key is valid or not" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 3, 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "name": "stdout", 171 | "output_type": "stream", 172 | "text": [ 173 | "OPENAI_API_KEY is set and is valid\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "api_key=os.environ[\"OPENAI_API_KEY\"]\n", 179 | "\n", 180 | "import openai\n", 181 | "from openai import OpenAI\n", 182 | "\n", 183 | "client = OpenAI()\n", 184 | "\n", 185 | "\n", 186 | "if api_key:\n", 187 | " try:\n", 188 | " client.models.list()\n", 189 | " print(\"OPENAI_API_KEY is set and is valid\")\n", 190 | " except openai.APIError as e:\n", 191 | " print(f\"OpenAI API returned an API Error: {e}\")\n", 192 | " pass\n", 193 | " except openai.APIConnectionError as e:\n", 194 | " print(f\"Failed to connect to OpenAI API: {e}\")\n", 195 | " pass\n", 196 | " except openai.RateLimitError as e:\n", 197 | " print(f\"OpenAI API request exceeded rate limit: {e}\")\n", 198 | " pass\n", 199 | "\n", 200 | "else:\n", 201 | " print(\"Please set you OpenAI API key as an environment variable OPENAI_API_KEY\")\n", 202 | "\n" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "The RAG pipeline takes three inputs - \n", 210 | "1. User Query\n", 211 | "2. Location of the Vector Index (Knowledge base)\n", 212 | "3. 
Index Name\n", 213 | "\n", 214 | "And generate an answer along with the retrieved documents\n" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "#### RAG function" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 4, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "import re\n", 231 | "from langchain_openai import OpenAIEmbeddings, ChatOpenAI\n", 232 | "from langchain_community.vectorstores import FAISS\n", 233 | "\n", 234 | "# Function to clean text\n", 235 | "def clean_text(text):\n", 236 | " # Replace non-breaking space with regular space\n", 237 | " text = text.replace('\\xa0', ' ')\n", 238 | " \n", 239 | " # Remove any HTML tags (if any)\n", 240 | " text = re.sub(r'<[^>]+>', '', text) # Removes HTML tags\n", 241 | " \n", 242 | " # Remove references in brackets (e.g., [7], [39])\n", 243 | " text = re.sub(r'\\[.*?\\]', '', text) # Removes references inside square brackets\n", 244 | " \n", 245 | " # Remove extra spaces and newlines\n", 246 | " text = ' '.join(text.split()) # This will remove extra spaces and newline characters\n", 247 | " \n", 248 | " return text\n", 249 | "\n", 250 | "def rag_function(query, db_path, index_name):\n", 251 | " embeddings=OpenAIEmbeddings(model=\"text-embedding-3-small\")\n", 252 | "\n", 253 | " db=FAISS.load_local(folder_path=db_path, index_name=index_name, embeddings=embeddings, allow_dangerous_deserialization=True)\n", 254 | "\n", 255 | " retrieved_docs = db.similarity_search(query, k=2)\n", 256 | "\n", 257 | " retrieved_context=[clean_text(retrieved_docs[0].page_content + retrieved_docs[1].page_content)]\n", 258 | "\n", 259 | "\n", 260 | " augmented_prompt=f\"\"\"\n", 261 | "\n", 262 | " Given the context below answer the question.\n", 263 | "\n", 264 | " Question: {query} \n", 265 | "\n", 266 | " Context : {retrieved_context}\n", 267 | "\n", 268 | " Remember to answer only based on the context provided and not from any other 
source. \n", 269 | "\n", 270 | " If the question cannot be answered based on the provided context, say I don’t know.\n", 271 | "\n", 272 | " \"\"\"\n", 273 | "\n", 274 | " llm = ChatOpenAI(\n", 275 | " model=\"gpt-4o-mini\",\n", 276 | " temperature=0,\n", 277 | " max_tokens=None,\n", 278 | " timeout=None,\n", 279 | " max_retries=2\n", 280 | " )\n", 281 | "\n", 282 | " messages=[(\"human\",augmented_prompt)]\n", 283 | "\n", 284 | " ai_msg = llm.invoke(messages)\n", 285 | "\n", 286 | " response=ai_msg.content\n", 287 | "\n", 288 | " return retrieved_context, response\n", 289 | "\n" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "Let's try sending our question to this function." 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 5, 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "data": { 306 | "text/plain": [ 307 | "(['The tournament was contested by ten national teams, maintaining the same format used in 2019 . After six weeks of round-robin matches, India , South Africa , Australia , and New Zealand finished as the top four and qualified for the knockout stage. In the knockout stage, India and Australia beat New Zealand and South Africa, respectively, to advance to the final, played on 19 November at the Narendra Modi Stadium in Ahmedabad . Australia won the final by six wickets, winning their sixth Cricket World Cup title.The host India was the first team to qualify for the semi-finals after their 302-run win against Sri Lanka , their seventh successive win in the World Cup. 
India secured the top place amongst the semi-finalists after they beat South Africa by 243 runs on 5 November at Eden Gardens in Kolkata .'],\n", 308 | " 'Australia won the world cup.')" 309 | ] 310 | }, 311 | "execution_count": 5, 312 | "metadata": {}, 313 | "output_type": "execute_result" 314 | } 315 | ], 316 | "source": [ 317 | "rag_function(query=\"Who won the world cup?\", db_path=\"../../Assets/Data\", index_name=\"CWC_index\")" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": {}, 323 | "source": [ 324 | "Let's ask another one." 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 6, 330 | "metadata": {}, 331 | "outputs": [ 332 | { 333 | "data": { 334 | "text/plain": [ 335 | "(['Virat Kohli was named the player of the tournament and also scored the most runs, while Mohammed Shami was the leading wicket-taker. A total of 1,250,307 spectators attended the matches, the highest number in any Cricket World Cup to date. The tournament final set viewership records in India, drawing 518 million viewers, with a peak of 57 million streaming viewers.The ICC announced its team of the tournament on 21 November 2023, with Virat Kohli being named as player of the tournament , and Rohit Sharma as captain of the team.'],\n", 336 | " 'Virat Kohli was named the player of the tournament and scored the most runs.')" 337 | ] 338 | }, 339 | "execution_count": 6, 340 | "metadata": {}, 341 | "output_type": "execute_result" 342 | } 343 | ], 344 | "source": [ 345 | "rag_function(query=\"What was Virat Kohli's achievement in the Cup?\",db_path=\"../../Assets/Data\", index_name=\"CWC_index\")" 346 | ] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "metadata": {}, 351 | "source": [ 352 | "We can also try asking a question which is out of the scope of our knowledge base" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 7, 358 | "metadata": {}, 359 | "outputs": [ 360 | { 361 | "data": { 362 | "text/plain": [ 
363 | "(['(RLQ=window.RLQ||).push(function(){mw.config.set({\"wgHostname\":\"mw-web.codfw.main-85db9df4c9-86vj4\",\"wgBackendResponseTime\":174,\"wgPageParseReport\":{\"limitreport\":{\"cputime\":\"2.102\",\"walltime\":\"2.387\",\"ppvisitednodes\":{\"value\":29880,\"limit\":1000000},\"postexpandincludesize\":{\"value\":547658,\"limit\":2097152},\"templateargumentsize\":{\"value\":113569,\"limit\":2097152},\"expansiondepth\":{\"value\":13,\"limit\":100},\"expensivefunctioncount\":{\"value\":22,\"limit\":500},\"unstrip-depth\":{\"value\":1,\"limit\":20},\"unstrip-size\":{\"value\":312186,\"limit\":5000000},\"entityaccesscount\":{\"value\":1,\"limit\":400},\"timingprofile\":[\"100.00% 1812.691 1 -total\",\" 22.76% 412.523 1 Template:Reflist\",\" 14.91% 270.321 37 Template:Cite_web\",\" 11.46% 207.704 58 Template:Single-innings_cricket_match\",\" 11.12% 201.536 1 Template:2023_CWC_and_2025_ICC_CT_sidebar\",\" 10.94% 198.332 1 Template:Sidebar_with_collapsible_lists\",\" 7.79% 141.132 96 Template:Cr\",\" 7.15% 129Background Host selection'],\n", 364 | " 'I don’t know.')" 365 | ] 366 | }, 367 | "execution_count": 7, 368 | "metadata": {}, 369 | "output_type": "execute_result" 370 | } 371 | ], 372 | "source": [ 373 | "rag_function(query=\"What RAG?\",db_path=\"../../Assets/Data\", index_name=\"CWC_index\")" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "For some of the questions, the response may be \"I don't know\". That is when the LLM can't find an answer in the retrieved context. In our augmentation step, we had asked the LLM to do so. But how good is this system? We need to be able to evaluate it." 381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "metadata": {}, 386 | "source": [ 387 | "## 2. 
RAGAs Framework" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": {}, 393 | "source": [ 394 | "[Ragas](https://docs.ragas.io/en/stable/) is a framework that helps you evaluate your Retrieval Augmented Generation (RAG) pipelines. It has been developed by the good folks at [exploding gradients](https://github.com/explodinggradients)." 395 | ] 396 | }, 397 | { 398 | "cell_type": "markdown", 399 | "metadata": {}, 400 | "source": [ 401 | "We will look at this evaluation in 2 parts. \n", 402 | "\n", 403 | "1. Creation of synthetic test data for evaluation.\n", 404 | "2. Calculation of evaluation metrics." 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "metadata": {}, 410 | "source": [ 411 | "### 2.1 Creation of Synthetic Data" 412 | ] 413 | }, 414 | { 415 | "cell_type": "markdown", 416 | "metadata": {}, 417 | "source": [ 418 | "Synthetic Data Generation uses LLMs to generate diverse questions and answers from the documents in the knowledge base. LLMs can be prompted to create questions like simple questions, multi-context questions, conditional questions, reasoning questions etc. 
using the documents from the knowledge base as context.\n", 419 | "\n", 420 | "" 421 | ] 422 | }, 423 | { 424 | "cell_type": "code", 425 | "execution_count": 8, 426 | "metadata": {}, 427 | "outputs": [ 428 | { 429 | "name": "stderr", 430 | "output_type": "stream", 431 | "text": [ 432 | "USER_AGENT environment variable not set, consider setting it to identify your requests.\n", 433 | "Fetching pages: 100%|##########| 1/1 [00:00<00:00, 3.25it/s]\n" 434 | ] 435 | } 436 | ], 437 | "source": [ 438 | "from langchain_community.document_loaders import AsyncHtmlLoader\n", 439 | "\n", 440 | "#This is the url of the wikipedia page on the 2023 Cricket World Cup\n", 441 | "url=\"https://en.wikipedia.org/wiki/2023_Cricket_World_Cup\"\n", 442 | "\n", 443 | "#Instantiating the AsyncHtmlLoader\n", 444 | "loader = AsyncHtmlLoader (url)\n", 445 | "\n", 446 | "#Loading the extracted information\n", 447 | "html_data = loader.load()\n", 448 | "\n", 449 | "from langchain_community.document_transformers import Html2TextTransformer\n", 450 | "\n", 451 | "#Instantiate the Html2TextTransformer function\n", 452 | "html2text = Html2TextTransformer()\n", 453 | "\n", 454 | "\n", 455 | "#Call transform_documents\n", 456 | "html_data_transformed = html2text.transform_documents(html_data)" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": 13, 462 | "metadata": {}, 463 | "outputs": [], 464 | "source": [ 465 | "from ragas.llms import LangchainLLMWrapper\n", 466 | "from ragas.embeddings import LangchainEmbeddingsWrapper\n", 467 | "\n", 468 | "\n", 469 | "generator_llm = LangchainLLMWrapper(ChatOpenAI(model=\"gpt-4o-mini\"))\n", 470 | "generator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings(model=\"text-embedding-3-small\"))" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 15, 476 | "metadata": {}, 477 | "outputs": [ 478 | { 479 | "name": "stderr", 480 | "output_type": "stream", 481 | "text": [ 482 | "Generating personas: 
100%|██████████| 1/1 [00:01<00:00, 1.06s/it] \n", 483 | "Generating Scenarios: 100%|██████████| 2/2 [00:07<00:00, 3.67s/it]\n", 484 | "Generating Samples: 100%|██████████| 10/10 [00:02<00:00, 3.40it/s]\n" 485 | ] 486 | } 487 | ], 488 | "source": [ 489 | "from ragas.testset import TestsetGenerator\n", 490 | "\n", 491 | "generator = TestsetGenerator(llm=generator_llm, embedding_model=generator_embeddings)\n", 492 | "dataset = generator.generate_with_langchain_docs(html_data_transformed, testset_size=10)" 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": 16, 498 | "metadata": {}, 499 | "outputs": [], 500 | "source": [ 501 | "sample_queries = dataset.to_pandas()['user_input'].to_list()" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": 17, 507 | "metadata": {}, 508 | "outputs": [], 509 | "source": [ 510 | "expected_responses=dataset.to_pandas()['reference'].to_list()" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": 19, 516 | "metadata": {}, 517 | "outputs": [], 518 | "source": [ 519 | "dataset_to_eval=[]\n", 520 | "\n", 521 | "for query, reference in zip(sample_queries,expected_responses):\n", 522 | " rag_call_response=rag_function(query=query, db_path=\"../../Assets/Data/\", index_name=\"CWC_index\")\n", 523 | " relevant_docs=rag_call_response[0]\n", 524 | " response=rag_call_response[1]\n", 525 | " dataset_to_eval.append(\n", 526 | " {\n", 527 | " \"user_input\":query,\n", 528 | " \"retrieved_contexts\":relevant_docs,\n", 529 | " \"response\":response,\n", 530 | " \"reference\":reference\n", 531 | " }\n", 532 | " )\n" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": 21, 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [ 541 | "from ragas import EvaluationDataset\n", 542 | "evaluation_dataset = EvaluationDataset.from_list(dataset_to_eval)\n" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": 22, 548 | "metadata": {}, 549 | 
You can interpret the results above. It looks like we are performing well on __faithfulness__, but the other metrics are low. How do we improve them? We will look at advanced pre-retrieval, retrieval, and post-retrieval strategies in the next chapter.
I'm Abhinav! I am an entrepreneur and Vice President of Artificial Intelligence at Yarnit. I have spent over 15 years in consulting and leadership roles in data science, machine learning, and AI. My current focus is the applied Generative AI domain, focusing on solving enterprise needs through contextual intelligence. I'm passionate about AI advancements, constantly exploring emerging technologies to push the boundaries and create positive impacts in the world. Let’s build the future, together!
"[![Linktree](https://img.shields.io/badge/Linktree-000000?style=for-the-badge&logo=linktree&logoColor=black&color=orange)](https://linktr.ee/abhinavkimothi)\n", 633 | "[![Gumroad](https://img.shields.io/badge/Gumroad-000000?style=for-the-badge&logo=gumroad&logoColor=orange&color=black)](https://abhinavkimothi.gumroad.com/)\n", 634 | "\n", 635 | "---" 636 | ] 637 | }, 638 | { 639 | "cell_type": "markdown", 640 | "metadata": {}, 641 | "source": [] 642 | } 643 | ], 644 | "metadata": { 645 | "kernelspec": { 646 | "display_name": ".ch5", 647 | "language": "python", 648 | "name": "python3" 649 | }, 650 | "language_info": { 651 | "codemirror_mode": { 652 | "name": "ipython", 653 | "version": 3 654 | }, 655 | "file_extension": ".py", 656 | "mimetype": "text/x-python", 657 | "name": "python", 658 | "nbconvert_exporter": "python", 659 | "pygments_lexer": "ipython3", 660 | "version": "3.13.2" 661 | } 662 | }, 663 | "nbformat": 4, 664 | "nbformat_minor": 2 665 | } 666 | -------------------------------------------------------------------------------- /Chapters/Chapter-06/advanced_rag.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | " \"New\n", 9 | "" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# Chapter 06 - Progression of RAG Systems: Naïve to Advanced, and Modular RAG" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "We have familiarized ourselves with the utility of RAG along with the development and evaluation of a basic RAG system. The basic, or the Naïve RAG approach that we have seen so far is, generally, inadequate when it comes to production-grade systems." 
24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "\n", 31 | " \"Naive\n", 32 | "\n" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "In this chapter we will focus on more advanced concepts in RAG that make RAG possible in production. Let's begin by installing dependencies." 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "## Installing Dependencies" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "All the necessary libraries for running this notebook along with their versions can be found in __requirements.txt__ file in the root directory of this repository\n", 54 | "\n", 55 | "You should go to the root directory and run the following command to install the libraries\n", 56 | "\n", 57 | "```\n", 58 | "pip install -r requirements.txt\n", 59 | "```\n", 60 | "\n", 61 | "This is the recommended method of installing the dependencies\n", 62 | "\n", 63 | "___\n", 64 | "Alternatively, you can run the command from this notebook too. 
The relative path may vary" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 2, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "\n", 77 | "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m25.0\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m25.0.1\u001b[0m\n", 78 | "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n", 79 | "Note: you may need to restart the kernel to use updated packages.\n" 80 | ] 81 | } 82 | ], 83 | "source": [ 84 | "%pip install -r ../../requirements.txt --quiet" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "## Advanced RAG Techniques" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "Advanced techniques in RAG have continued to emerge since the earliest experiments with Naïve RAG. There are three stages in which we can discuss these techniques – \n", 99 | "1.\tPre-retrieval Stage: Like the name suggests, there are certain interventions that can be employed before the retriever comes into action. This broadly covers two aspects \n", 100 | " - Index Optimization – The way documents are stored in the knowledge base\n", 101 | " - Query Optimization – Optimizing the user query so it aligns better to the retrieval and generation tasks\n", 102 | "2.\tRetrieval Stage: Certain strategies can improve the recall and precision of the retrieval process. 
- Inside the .env file, define environment variables in the format VARIABLE_NAME=value.
\n", 136 | "\n", 137 | "e.g.\n", 138 | "\n", 139 | "OPENAI_API_KEY=YOUR API KEY" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 3, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | "Success: .env file found with some environment variables\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "from dotenv import load_dotenv\n", 157 | "import os\n", 158 | "\n", 159 | "if load_dotenv():\n", 160 | " print(\"Success: .env file found with some environment variables\")\n", 161 | "else:\n", 162 | " print(\"Caution: No environment variables found. Please create .env file in the root directory or add environment variables in the .env file\")" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 4, 168 | "metadata": {}, 169 | "outputs": [ 170 | { 171 | "name": "stdout", 172 | "output_type": "stream", 173 | "text": [ 174 | "OPENAI_API_KEY is set and is valid\n" 175 | ] 176 | } 177 | ], 178 | "source": [ 179 | "api_key=os.environ[\"OPENAI_API_KEY\"]\n", 180 | "\n", 181 | "from openai import OpenAI\n", 182 | "\n", 183 | "client = OpenAI()\n", 184 | "\n", 185 | "\n", 186 | "if api_key:\n", 187 | " try:\n", 188 | " client.models.list()\n", 189 | " print(\"OPENAI_API_KEY is set and is valid\")\n", 190 | " except openai.APIError as e:\n", 191 | " print(f\"OpenAI API returned an API Error: {e}\")\n", 192 | " pass\n", 193 | " except openai.APIConnectionError as e:\n", 194 | " print(f\"Failed to connect to OpenAI API: {e}\")\n", 195 | " pass\n", 196 | " except openai.RateLimitError as e:\n", 197 | " print(f\"OpenAI API request exceeded rate limit: {e}\")\n", 198 | " pass\n", 199 | "\n", 200 | "else:\n", 201 | " print(\"Please set you OpenAI API key as an environment variable OPENAI_API_KEY\")\n", 202 | "\n" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "## 1. 
The primary objective of employing pre-retrieval techniques is to facilitate better retrieval. Retrieval failures can happen for two reasons.
"source": [ 276 | "url=\"https://en.wikipedia.org/wiki/2023_Cricket_World_Cup\"" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 8, 282 | "metadata": {}, 283 | "outputs": [ 284 | { 285 | "name": "stderr", 286 | "output_type": "stream", 287 | "text": [ 288 | "Fetching pages: 0%| | 0/1 [00:00 \n", 1211 | "**Part of a series on the** \n", 1212 | "--- \n", 1213 | "2023 Cricket World Cup / \n", 1214 | "2025 ICC Champions Trophy \n", 1215 | "CWC: Category • Commons \n", 1216 | "CT: Category • Commons \n", 1217 | "2023 Cricket World Cup \n", 1218 | "Background\n", 1219 | "\n", 1220 | " * Host selection\n", 1221 | " * COVID-19 pandemic\n", 1222 | " * Format\n", 1223 | " * Pakistan's participation\n", 1224 | " * Prize money\n", 1225 | " * Marketing\n", 1226 | "\n", 1227 | " \n", 1228 | "Stages\n", 1229 | "\n", 1230 | " * Warm-up matches\n", 1231 | "\n", 1232 | " * Group stage\n", 1233 | "\n", 1234 | " * Knockout stage \n", 1235 | " * Semi-finals\n", 1236 | " * Final\n", 1237 | "\n", 1238 | " \n", 1239 | "General Information\n", 1240 | "\n", 1241 | " * Officials\n", 1242 | " * Squads\n", 1243 | " * Statistics\n", 1244 | " * Venues\n", 1245 | "\n", 1246 | " \n", 1247 | "CWC Qualification \n", 1248 | "Overview \n", 1249 | "Super League\n", 1250 | "\n", 1251 | " * 2020–2023 Super League\n", 1252 | "\n", 1253 | " * 2020 \n", 1254 | " * AUS v ENG\n", 1255 | " * NED v ZIM\n", 1256 | " * IRE v ENG\n", 1257 | "\n", 1258 | " * 2020–21 \n", 1259 | " * ENG v IND\n", 1260 | " * IND v AUS\n", 1261 | " * BAN v NZ\n", 1262 | " * PAK v SA\n", 1263 | " * ZIM v PAK\n", 1264 | " * ZIM v SL\n", 1265 | " * ENG v SA\n", 1266 | " * WIN v BAN\n", 1267 | " * IRE v AFG in UAE\n", 1268 | " * SL v WIN\n", 1269 | "\n", 1270 | " * 2021 \n", 1271 | " * SL v BAN\n", 1272 | " * IRE v NED\n", 1273 | " * AUS v WIN\n", 1274 | " * BAN v ZIM\n", 1275 | " * SL v ENG\n", 1276 | " * PAK v ENG\n", 1277 | " * SA v IRE\n", 1278 | " * IND v SL\n", 1279 | " * AFG v SL\n", 1280 | " * ZIM v 
IRE\n", 1281 | "\n", 1282 | " * 2021–22 \n", 1283 | " * AFG v PAK\n", 1284 | " * SA v SL\n", 1285 | " * ZIM v SL\n", 1286 | " * WIN v IND\n", 1287 | " * AFG v BAN\n", 1288 | " * AUS v PAK\n", 1289 | " * BAN v SA\n", 1290 | " * NED v SA\n", 1291 | " * NED v NZ\n", 1292 | " * IRE v WIN\n", 1293 | " * NED v AFG in Qatar\n", 1294 | " * AFG v IND\n", 1295 | "\n", 1296 | " * 2022 \n", 1297 | " * NZ v IRE\n", 1298 | " * WIN v NED\n", 1299 | " * ENG v NED\n", 1300 | " * WIN v PAK\n", 1301 | " * PAK v NED\n", 1302 | " * AFG v ZIM\n", 1303 | " * NZ v WIN\n", 1304 | " * IND v ZIM\n", 1305 | " * ZIM v AUS\n", 1306 | "\n", 1307 | " * 2022–23 \n", 1308 | " * NZ v AUS\n", 1309 | " * SA v IND\n", 1310 | " * AFG v SL\n", 1311 | " * NZ v PAK\n", 1312 | " * SA v AUS\n", 1313 | " * ENG v SA\n", 1314 | " * ENG v BAN\n", 1315 | " * SL v NZ\n", 1316 | " * NED v ZIM\n", 1317 | "\n", 1318 | " * 2023 \n", 1319 | " * BAN v IRE\n", 1320 | "\n", 1321 | " \n", 1322 | "League 2\n", 1323 | "\n", 1324 | " * 2019–2023 League 2\n", 1325 | "\n", 1326 | " * 2019 \n", 1327 | " * 1\n", 1328 | " * 2\n", 1329 | " * 3\n", 1330 | "\n", 1331 | " * 2020 \n", 1332 | " * 4\n", 1333 | " * 5\n", 1334 | "\n", 1335 | " * 2021 \n", 1336 | " * 6\n", 1337 | " * 7\n", 1338 | " * 8\n", 1339 | "\n", 1340 | " * 2022 \n", 1341 | " * N/A\n", 1342 | " * 9\n", 1343 | " * 10\n", 1344 | " * 11\n", 1345 | " * 12\n", 1346 | " * 13\n", 1347 | " * 14\n", 1348 | " * 15\n", 1349 | " * 16\n", 1350 | " * 17\n", 1351 | " * 18\n", 1352 | "\n", 1353 | " * 2023 \n", 1354 | " * 19\n", 1355 | " * N/A\n", 1356 | " * 20\n", 1357 | " * 21\n", 1358 | "\n", 1359 | " \n", 1360 | "Challenge League\n", 1361 | "\n", 1362 | " * 2019–2022 Challenge League\n", 1363 | "\n", 1364 | " * A \n", 1365 | " * 2019\n", 1366 | " * 2021 (2022)\n", 1367 | " * 2020 (2022)\n", 1368 | "\n", 1369 | " * B \n", 1370 | " * 2019\n", 1371 | " * 2020 (2022)\n", 1372 | " * 2021 (2022)\n", 1373 | "\n", 1374 | " \n", 1375 | "CWC Qualifier\n", 1376 | "\n", 1377 | " * 2023 
Qualifier Play-off\n", 1378 | " * 2023 Qualifier\n", 1379 | "\n", 1380 | " \n", 1381 | "2025 ICC Champions Trophy \n", 1382 | "Background\n", 1383 | "\n", 1384 | " * Host selection\n", 1385 | " * Format\n", 1386 | " * India's participation\n", 1387 | " * Prize money\n", 1388 | " * Marketing\n", 1389 | "\n", 1390 | " \n", 1391 | "Stages\n", 1392 | "\n", 1393 | " * Warm-up matches\n", 1394 | "\n", 1395 | " * Group stage \n", 1396 | " * Group A\n", 1397 | " * Group B\n", 1398 | "\n", 1399 | " * Knockout stage \n", 1400 | " * Semi-finals\n", 1401 | " * Final\n", 1402 | "\n", 1403 | " \n", 1404 | "General Information\n", 1405 | "\n", 1406 | " * Officials\n", 1407 | " * Squads\n", 1408 | " * Statistics\n", 1409 | " * Venues\n", 1410 | "\n", 1411 | " \n", 1412 | "← 2019 CWC 2027 → \n", 1413 | "← 2017 CT 2029 → \n", 1414 | " \n", 1415 | " * v\n", 1416 | " * t\n", 1417 | " * e\n", 1418 | "\n", 1419 | " \n", 1420 | " \n", 1421 | "The **2023 ICC Men's Cricket World Cup** was the 13th edition of the ICC Men's\n", 1422 | "Cricket World Cup, a quadrennial One Day International (ODI) cricket\n", 1423 | "tournament organized by the International Cricket Council (ICC). It was hosted\n", 1424 | "from 5 October to 19 November 2023 across ten venues in India. This was the\n", 1425 | "fourth World Cup held in India, but the first where India was the sole host.\n", 1426 | "\n", 1427 | "The tournament was contested by ten national teams, maintaining the same\n", 1428 | "format used in 2019. After six weeks of round-robin matches, India, South\n", 1429 | "Africa, Australia, and New Zealand finished as the top four and qualified for\n", 1430 | "the knockout stage. In the knockout stage, India and Australia beat New\n", 1431 | "Zealand and South Africa, respectively, to advance to the final, played on 19\n", 1432 | "November at the Narendra Modi Stadium in Ahmedabad. 
Australia won the final by\n", 1433 | "six wickets, winning their sixth Cricket World Cup title.\n", 1434 | "\n", 1435 | "Virat Kohli was named the player of the tournament and also scored the most\n", 1436 | "runs, while Mohammed Shami was the leading wicket-taker. A total of 1,250,307\n", 1437 | "spectators attended the matches, the highest number in any Cricket World Cup\n", 1438 | "to date.[1] The tournament final set viewership records in India, drawing 518\n", 1439 | "million viewers, with a peak of 57 million streaming viewers.\n", 1440 | "\n", 1441 | "## Background\n", 1442 | "\n", 1443 | "### Host selection\n", 1444 | "\n", 1445 | "On 11 December 2017, India was announced by the ICC as hosts of the 2023\n", 1446 | "Cricket World Cup; while India had served as a co-host during three previous\n", 1447 | "tournaments (most recently in 2011, which it co-hosted with Sri Lanka and\n", 1448 | "Bangladesh), it would mark the first Cricket World Cup to be hosted solely by\n", 1449 | "India.[2]\n", 1450 | "\n", 1451 | "### COVID-19 pandemic\n", 1452 | "\n", 1453 | "Further information: Impact of the COVID-19 pandemic on cricket\n", 1454 | "\n", 1455 | "Originally, the competition was to be played from 9 February to 26 March\n", 1456 | "2023.[3][4] In July 2020 it was announced that due to the disruption of the\n", 1457 | "qualification schedule by the COVID-19 pandemic, the start of the tournament\n", 1458 | "would be delayed to October.[5][6] The ICC released the tournament schedule on\n", 1459 | "27 June 2023.[7][8]\n", 1460 | "\n", 1461 | "### Format\n", 1462 | "\n", 1463 | "This was the first ICC World Cup in which penalties for slow over-rates were\n", 1464 | "given to bowling sides if they did not complete their 50 overs in the\n", 1465 | "stipulated time. 
On-field umpires could penalise the bowling team by not\n", 1466 | "allowing more than four fielders outside the 30-yard circle.[9]\n", 1467 | "\n", 1468 | "### Pakistan's participation\n", 1469 | "\n", 1470 | "The Pakistan Cricket Board (PCB) had threatened to boycott the tournament\n", 1471 | "after the Board of Control for Cricket in India (BCCI) refused to send a team\n", 1472 | "to the 2023 Asia Cup scheduled in Pakistan.[10][11] This issue was resolved in\n", 1473 | "June 2023 after the Asian Cricket Council announced that the tournament would\n", 1474 | "be hosted using a hybrid model proposed by the PCB, with nine of the 13\n", 1475 | "matches in the competition played in Sri Lanka.[12][13]\n", 1476 | "\n", 1477 | "### Prize money\n", 1478 | "\n", 1479 | "The ICC allocated a pool of US$10 million in prize money for the tournament,\n", 1480 | "with payouts remaining the same as the 2019 and 2015 tournaments. Australia,\n", 1481 | "the winning team, received US$4,000,000, the runner-up $2,000,000 and the\n", 1482 | "losing semi-finalists $1,600,000. Teams that did not progress past the league\n", 1483 | "stage received $100,000 and the winner of each league stage match received\n", 1484 | "$40,000.[14][15]\n", 1485 | "\n", 1486 | "### Marketing\n", 1487 | "\n", 1488 | "The ICC hosted a trophy tour for 100 days prior to the tournament beginning 27\n", 1489 | "June, with the Cricket World Cup Trophy being taken to various locations\n", 1490 | "around the world. The event began with the launching of the trophy into the\n", 1491 | "stratosphere by Sent Into Space and landing at Modi Stadium—becoming the first\n", 1492 | "sports trophy to have ever been sent into space.[16] The ICC officially\n", 1493 | "announced the mascots for the World Cup in August. 
The mascots were a male and\n", 1494 | "female duo named \"Tonk\" and \"Blaze\" from the fictional cricketing utopia\n", 1495 | "\"Crictoverse\".[17][18]\n", 1496 | "\n", 1497 | "Ahead of the tournament, it was reported that an opening ceremony would take\n", 1498 | "place on 4 October 2023 at the Narendra Modi Stadium in Ahmedabad, a day\n", 1499 | "before the opening match at the same venue.[19] The official theme song of the\n", 1500 | "2023 Cricket World Cup titled \"Dil Jashn Bole\" (transl. Heart say celebrate)\n", 1501 | "was released on 20 September. The song was composed by Pritam, and was sung by\n", 1502 | "Pritam, Nakash Aziz, Sreerama Chandra, Amit Mishra, Jonita Gandhi, Akasa Singh\n", 1503 | "and S. P. Charan.[20] However, the song was subject to backlash and bad\n", 1504 | "reviews.[21] The opening ceremony was cancelled and replaced by a closing\n", 1505 | "ceremony ahead of the final.[22] During this a drone show was held.[23][24]\n", 1506 | "\n", 1507 | "## Qualification\n", 1508 | "\n", 1509 | "Highlighted are the countries that participated in the 2023 Cricket World Cup.\n", 1510 | "\n", 1511 | "Qualified as host\n", 1512 | "\n", 1513 | "Qualified via the 2020–2023 Super League\n", 1514 | "\n", 1515 | "Qualified via the 2023 Qualifier\n", 1516 | "\n", 1517 | "Participated in the qualifier but failed to qualify\n", 1518 | "Metadata: {'source': 'https://en.wikipedia.org/wiki/2023_Cricket_World_Cup', 'category': 'cricket world cup', 'extracted_metadata': {'player_1': 'Virat Kohli', 'player_2': 'Mohammed Shami', 'player_3': '', 'player_4': '', 'player_5': '', 'team_1': 'Australia', 'team_2': 'India', 'team_3': 'New Zealand', 'team_4': 'South Africa', 'team_5': '', 'keyword_1': '2023 Cricket World Cup', 'keyword_2': 'One Day International', 'keyword_3': 'International Cricket Council', 'keyword_4': 'Knockout stage', 'keyword_5': 'Prize money'}}\n" 1519 | ] 1520 | } 1521 | ], 1522 | "source": [ 1523 | "import faiss\n", 1524 | "from 
langchain_community.vectorstores import FAISS\n", 1525 | "from langchain_community.docstore.in_memory import InMemoryDocstore\n", 1526 | "from langchain_openai import OpenAIEmbeddings\n", 1527 | "from langchain_core.documents import Document\n", 1528 | "from langchain_community.document_loaders import AsyncHtmlLoader\n", 1529 | "from langchain_community.document_transformers import Html2TextTransformer\n", 1530 | "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", 1531 | "from openai import OpenAI\n", 1532 | "from langchain_openai import ChatOpenAI\n", 1533 | "\n", 1534 | "# Initialize the OpenAI client\n", 1535 | "client = OpenAI()\n", 1536 | "\n", 1537 | "# Function to extract fixed metadata using GPT-4o-mini with JSON response\n", 1538 | "def extract_fixed_metadata_from_chunk(chunk_text):\n", 1539 | " prompt = f\"\"\"\n", 1540 | " Extract the following fixed metadata in JSON format from the given text:\n", 1541 | " {{\n", 1542 | " \"player_1\": \"\",\n", 1543 | " \"player_2\": \"\",\n", 1544 | " \"player_3\": \"\",\n", 1545 | " \"player_4\": \"\",\n", 1546 | " \"player_5\": \"\",\n", 1547 | " \"team_1\": \"\",\n", 1548 | " \"team_2\": \"\",\n", 1549 | " \"team_3\": \"\",\n", 1550 | " \"team_4\": \"\",\n", 1551 | " \"team_5\": \"\",\n", 1552 | " \"keyword_1\": \"\",\n", 1553 | " \"keyword_2\": \"\",\n", 1554 | " \"keyword_3\": \"\",\n", 1555 | " \"keyword_4\": \"\",\n", 1556 | " \"keyword_5\": \"\"\n", 1557 | " }}\n", 1558 | " Here's the text:\n", 1559 | " {chunk_text}\n", 1560 | " \"\"\"\n", 1561 | "\n", 1562 | " llm = ChatOpenAI(\n", 1563 | " model=\"gpt-4o-mini\",\n", 1564 | " temperature=0,\n", 1565 | " max_tokens=None,\n", 1566 | " timeout=None,\n", 1567 | " max_retries=2\n", 1568 | " )\n", 1569 | "\n", 1570 | " json_llm = llm.bind(response_format={\"type\": \"json_object\"})\n", 1571 | "\n", 1572 | "\n", 1573 | " #Craft the prompt message\n", 1574 | " messages=[(\"human\",prompt)]\n", 1575 | "\n", 1576 | "\n", 1577 | " # Invoke the 
LLM\n", 1578 | " ai_msg = json_llm.invoke(messages)\n", 1579 | " \n", 1580 | "\n", 1581 | " \n", 1582 | " # Extract the response in JSON format\n", 1583 | " metadata_response = ai_msg.content\n", 1584 | " print(metadata_response)\n", 1585 | " try:\n", 1586 | " # Convert the response into a dictionary\n", 1587 | " metadata = eval(metadata_response) # This ensures it is a valid dictionary\n", 1588 | " except Exception as e:\n", 1589 | " print(f\"Error parsing metadata: {e}\")\n", 1590 | " metadata = {\n", 1591 | " \"player_1\": \"\", \"player_2\": \"\", \"player_3\": \"\", \"player_4\": \"\", \"player_5\": \"\",\n", 1592 | " \"team_1\": \"\", \"team_2\": \"\", \"team_3\": \"\", \"team_4\": \"\", \"team_5\": \"\",\n", 1593 | " \"keyword_1\": \"\", \"keyword_2\": \"\", \"keyword_3\": \"\", \"keyword_4\": \"\", \"keyword_5\": \"\"\n", 1594 | " }\n", 1595 | " return metadata\n", 1596 | "\n", 1597 | "# Step 1: Load data from a URL (Wikipedia page)\n", 1598 | "url = \"https://en.wikipedia.org/wiki/2023_Cricket_World_Cup\"\n", 1599 | "loader = AsyncHtmlLoader(url)\n", 1600 | "data = loader.load()\n", 1601 | "\n", 1602 | "# Step 2: Transform the HTML content to plain text\n", 1603 | "html2text = Html2TextTransformer()\n", 1604 | "data_transformed = html2text.transform_documents(data)\n", 1605 | "\n", 1606 | "# Step 3: Split the text into smaller chunks using RecursiveCharacterTextSplitter\n", 1607 | "text_splitter = RecursiveCharacterTextSplitter(\n", 1608 | " chunk_size=10000, # Number of characters in each chunk\n", 1609 | " chunk_overlap=200 # Number of overlapping characters between chunks\n", 1610 | ")\n", 1611 | "chunks = text_splitter.split_text(data_transformed[0].page_content)\n", 1612 | "\n", 1613 | "# Step 4: Initialize OpenAI Embeddings model\n", 1614 | "embedding_model = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n", 1615 | "\n", 1616 | "# Step 5: Initialize FAISS index for L2 (Euclidean) distance\n", 1617 | "embedding_dim = 
len(embedding_model.embed_query(\"hello world\"))\n", 1618 | "index = faiss.IndexFlatL2(embedding_dim)\n", 1619 | "\n", 1620 | "# Step 6: Initialize the InMemoryDocstore to store documents and metadata in memory\n", 1621 | "docstore = InMemoryDocstore()\n", 1622 | "\n", 1623 | "# Step 7: Create FAISS vector store using the embedding function, FAISS index, and docstore\n", 1624 | "vector_store = FAISS(\n", 1625 | " embedding_function=embedding_model,\n", 1626 | " index=index,\n", 1627 | " docstore=docstore,\n", 1628 | " index_to_docstore_id={}\n", 1629 | ")\n", 1630 | "\n", 1631 | "# Step 8: Add chunks (documents) with extracted metadata and embeddings to FAISS vector store\n", 1632 | "documents = []\n", 1633 | "for i, chunk in enumerate(chunks):\n", 1634 | " # Extract fixed metadata using the LLM\n", 1635 | " extracted_metadata = extract_fixed_metadata_from_chunk(chunk)\n", 1636 | " \n", 1637 | " # Create a document object with both the chunk content and the extracted metadata\n", 1638 | " document = Document(\n", 1639 | " page_content=chunk, \n", 1640 | " metadata={\n", 1641 | " \"source\": url, \n", 1642 | " \"category\": \"cricket world cup\",\n", 1643 | " \"extracted_metadata\": extracted_metadata # Store the structured metadata\n", 1644 | " }\n", 1645 | " )\n", 1646 | " \n", 1647 | " # Append the document to the list\n", 1648 | " documents.append(document)\n", 1649 | "\n", 1650 | "# Create unique IDs for each chunk\n", 1651 | "ids = [f\"chunk_{i}\" for i in range(len(chunks))]\n", 1652 | "\n", 1653 | "# Add the documents and their embeddings to the FAISS vector store\n", 1654 | "vector_store.add_documents(documents=documents, ids=ids)\n", 1655 | "\n", 1656 | "# Step 9: Define a function to extract metadata from a query\n", 1657 | "def extract_fixed_metadata_from_query(query_text):\n", 1658 | " prompt = f\"\"\"\n", 1659 | " Extract the following fixed metadata in JSON format from the query:\n", 1660 | " {{\n", 1661 | " \"player_1\": \"\",\n", 1662 | " 
\"player_2\": \"\",\n", 1663 | " \"player_3\": \"\",\n", 1664 | " \"player_4\": \"\",\n", 1665 | " \"player_5\": \"\",\n", 1666 | " \"team_1\": \"\",\n", 1667 | " \"team_2\": \"\",\n", 1668 | " \"team_3\": \"\",\n", 1669 | " \"team_4\": \"\",\n", 1670 | " \"team_5\": \"\",\n", 1671 | " \"keyword_1\": \"\",\n", 1672 | " \"keyword_2\": \"\",\n", 1673 | " \"keyword_3\": \"\",\n", 1674 | " \"keyword_4\": \"\",\n", 1675 | " \"keyword_5\": \"\"\n", 1676 | " }}\n", 1677 | " Here's the query:\n", 1678 | " {query_text}\n", 1679 | " \"\"\"\n", 1680 | " \n", 1681 | "\n", 1682 | " llm = ChatOpenAI(\n", 1683 | " model=\"gpt-4o-mini\",\n", 1684 | " temperature=0,\n", 1685 | " max_tokens=None,\n", 1686 | " timeout=None,\n", 1687 | " max_retries=2\n", 1688 | " )\n", 1689 | "\n", 1690 | " json_llm = llm.bind(response_format={\"type\": \"json_object\"})\n", 1691 | "\n", 1692 | "\n", 1693 | " #Craft the prompt message\n", 1694 | " messages=[(\"human\",prompt)]\n", 1695 | "\n", 1696 | "\n", 1697 | " # Invoke the LLM\n", 1698 | " ai_msg = json_llm.invoke(messages)\n", 1699 | "\n", 1700 | "\n", 1701 | "\n", 1702 | "\n", 1703 | " # Extract the response in JSON format\n", 1704 | " metadata_response = ai_msg.content\n", 1705 | " try:\n", 1706 | " # Convert the response into a dictionary\n", 1707 | " metadata = eval(metadata_response)\n", 1708 | " except Exception as e:\n", 1709 | " print(f\"Error parsing metadata: {e}\")\n", 1710 | " metadata = {\n", 1711 | " \"player_1\": \"\", \"player_2\": \"\", \"player_3\": \"\", \"player_4\": \"\", \"player_5\": \"\",\n", 1712 | " \"team_1\": \"\", \"team_2\": \"\", \"team_3\": \"\", \"team_4\": \"\", \"team_5\": \"\",\n", 1713 | " \"keyword_1\": \"\", \"keyword_2\": \"\", \"keyword_3\": \"\", \"keyword_4\": \"\", \"keyword_5\": \"\"\n", 1714 | " }\n", 1715 | " return metadata\n", 1716 | "\n", 1717 | "# Step 10: Extract metadata from the query\n", 1718 | "query = \"Virat Kohli records in 2023 Cricket World Cup\"\n", 1719 | "query_metadata = 
extract_fixed_metadata_from_query(query)\n", 1720 | "\n", 1721 | "# Step 11: Define a metadata filter based on the query's extracted metadata\n", 1722 | "def metadata_filter(doc_metadata):\n", 1723 | " query_players = {query_metadata[f\"player_{i}\"] for i in range(1, 6) if query_metadata[f\"player_{i}\"]}\n", 1724 | " query_teams = {query_metadata[f\"team_{i}\"] for i in range(1, 6) if query_metadata[f\"team_{i}\"]}\n", 1725 | " query_keywords = {query_metadata[f\"keyword_{i}\"] for i in range(1, 6) if query_metadata[f\"keyword_{i}\"]}\n", 1726 | " doc_players = {doc_metadata[\"extracted_metadata\"][f\"player_{i}\"] for i in range(1, 6) if doc_metadata[\"extracted_metadata\"][f\"player_{i}\"]}\n", 1727 | " doc_teams = {doc_metadata[\"extracted_metadata\"][f\"team_{i}\"] for i in range(1, 6) if doc_metadata[\"extracted_metadata\"][f\"team_{i}\"]}\n", 1728 | " doc_keywords = {doc_metadata[\"extracted_metadata\"][f\"keyword_{i}\"] for i in range(1, 6) if doc_metadata[\"extracted_metadata\"][f\"keyword_{i}\"]}\n", 1729 | " \n", 1730 | " # Check if there's any overlap between the query metadata and document metadata\n", 1731 | " return bool(query_players & doc_players or query_teams & doc_teams or query_keywords & doc_keywords)\n", 1732 | "\n", 1733 | "# Step 12: Perform a similarity search on the stored chunks with the metadata filter\n", 1734 | "results = vector_store.similarity_search(query=query, k=3, filter=metadata_filter)\n", 1735 | "\n", 1736 | "# Step 13: Display the results with metadata\n", 1737 | "for doc in results:\n", 1738 | " print(f\"Document: {doc.page_content}\")\n", 1739 | " print(f\"Metadata: {doc.metadata}\")\n" 1740 | ] 1741 | }, 1742 | { 1743 | "cell_type": "markdown", 1744 | "metadata": {}, 1745 | "source": [ 1746 | "### 1.2 QUERY OPTIMIZATION\n", 1747 | "\n", 1748 | "The objective of this stage is to optimize the input user query in a manner that makes it better suited for the retrieval tasks" 1749 | ] 1750 | }, 1751 | { 1752 | "cell_type": 
"markdown", 1753 | "metadata": {}, 1754 | "source": [ 1755 | "#### Query Expansion\n", 1756 | "\n", 1757 | "In query expansion, the original user query is enriched with the aim of retrieving more relevant information. This helps in increasing the recall of the system and overcomes the challenge of incomplete or very brief user queries." 1758 | ] 1759 | }, 1760 | { 1761 | "cell_type": "code", 1762 | "execution_count": 28, 1763 | "metadata": {}, 1764 | "outputs": [], 1765 | "source": [ 1766 | "original_query=\"How does climate change affect polar bears?\"\n", 1767 | "num=5" 1768 | ] 1769 | }, 1770 | { 1771 | "cell_type": "code", 1772 | "execution_count": 29, 1773 | "metadata": {}, 1774 | "outputs": [], 1775 | "source": [ 1776 | "response_structure='''\n", 1777 | "{\n", 1778 | " \"queries\": [\n", 1779 | " {\n", 1780 | " \"query\": \"query\",\n", 1781 | " },\n", 1782 | " ...\n", 1783 | "]}\n", 1784 | "'''" 1785 | ] 1786 | }, 1787 | { 1788 | "cell_type": "code", 1789 | "execution_count": 30, 1790 | "metadata": {}, 1791 | "outputs": [ 1792 | { 1793 | "name": "stderr", 1794 | "output_type": "stream", 1795 | "text": [ 1796 | "<>:1: SyntaxWarning: invalid escape sequence '\\S'\n", 1797 | "<>:1: SyntaxWarning: invalid escape sequence '\\S'\n", 1798 | "/var/folders/kz/0m0zvdwn54798h3cfz47bcjw0000gn/T/ipykernel_29711/22333251.py:1: SyntaxWarning: invalid escape sequence '\\S'\n", 1799 | " expansion_prompt=f\"Generate {num} variations of the following query: {original_query}. Respond in JSON format.\\Stick to this Structure :\\n{response_structure}\"\n" 1800 | ] 1801 | } 1802 | ], 1803 | "source": [ 1804 | "expansion_prompt=f\"Generate {num} variations of the following query: {original_query}. 
Respond in JSON format.\\nStick to this Structure :\\n{response_structure}\"" 1805 | ] 1806 | }, 1807 | { 1808 | "cell_type": "code", 1809 | "execution_count": 31, 1810 | "metadata": {}, 1811 | "outputs": [], 1812 | "source": [ 1813 | "step_back_expansion_prompt = f\"Given the query: '{original_query}', generate a more abstract, higher-level conceptual query.\"" 1814 | ] 1815 | }, 1816 | { 1817 | "cell_type": "code", 1818 | "execution_count": 32, 1819 | "metadata": {}, 1820 | "outputs": [], 1821 | "source": [ 1822 | "sub_query_expansion_prompt=f\"Break down the following query into {num} sub-queries targeting different aspects of the query: '{original_query}'. Respond in JSON format.\"\n" 1823 | ] 1824 | }, 1825 | { 1826 | "cell_type": "code", 1827 | "execution_count": 33, 1828 | "metadata": {}, 1829 | "outputs": [], 1830 | "source": [ 1831 | "# Importing the OpenAI library\n", 1832 | "from openai import OpenAI\n", 1833 | "\n", 1834 | "# Instantiate the OpenAI client\n", 1835 | "client = OpenAI()\n", 1836 | "\n", 1837 | "# Make the API call passing the augmented prompt to the LLM\n", 1838 | "response = client.chat.completions.create(\n", 1839 | "    model=\"gpt-4o-mini\",\n", 1840 | "    messages=\t[\n", 1841 | "        {\"role\": \"user\", \"content\": expansion_prompt}\n", 1842 | "    \t\t],\n", 1843 | "    response_format={ \"type\": \"json_object\" }\n", 1844 | ")\n", 1845 | "\n", 1846 | "# Extract the answer from the response object\n", 1847 | "answer=response.choices[0].message.content" 1848 | ] 1849 | }, 1850 | { 1851 | "cell_type": "code", 1852 | "execution_count": 34, 1853 | "metadata": {}, 1854 | "outputs": [ 1855 | { 1856 | "name": "stdout", 1857 | "output_type": "stream", 1858 | "text": [ 1859 | "{\n", 1860 | "  \"queries\": [\n", 1861 | "    {\n", 1862 | "      \"query\": \"What impact does climate change have on polar bear populations?\"\n", 1863 | "    },\n", 1864 | "    {\n", 1865 | "      \"query\": \"In what ways does climate change influence the habitat of polar bears?\"\n", 1866 | "
},\n", 1867 | " {\n", 1868 | " \"query\": \"How are polar bears being affected by global warming?\"\n", 1869 | " },\n", 1870 | " {\n", 1871 | " \"query\": \"What are the consequences of climate change for polar bear survival?\"\n", 1872 | " },\n", 1873 | " {\n", 1874 | " \"query\": \"How is the behavior of polar bears changing due to climate change?\"\n", 1875 | " }\n", 1876 | " ]\n", 1877 | "}\n" 1878 | ] 1879 | } 1880 | ], 1881 | "source": [ 1882 | "print(answer)" 1883 | ] 1884 | }, 1885 | { 1886 | "cell_type": "code", 1887 | "execution_count": 35, 1888 | "metadata": {}, 1889 | "outputs": [], 1890 | "source": [ 1891 | "\n", 1892 | "\n", 1893 | "# Make the API call passing the augmented prompt to the LLM\n", 1894 | "response = client.chat.completions.create(\n", 1895 | " model=\"gpt-4o-mini\",\n", 1896 | " messages=\t[\n", 1897 | " {\"role\": \"user\", \"content\": step_back_expansion_prompt}\n", 1898 | " ]\n", 1899 | ")\n", 1900 | "\n", 1901 | "# Extract the answer from the response object\n", 1902 | "answer=response.choices[0].message.content" 1903 | ] 1904 | }, 1905 | { 1906 | "cell_type": "code", 1907 | "execution_count": 36, 1908 | "metadata": {}, 1909 | "outputs": [ 1910 | { 1911 | "name": "stdout", 1912 | "output_type": "stream", 1913 | "text": [ 1914 | "\"What are the broader ecological and environmental impacts of climate change on specialized species in arctic ecosystems?\"\n" 1915 | ] 1916 | } 1917 | ], 1918 | "source": [ 1919 | "print(answer)" 1920 | ] 1921 | }, 1922 | { 1923 | "cell_type": "code", 1924 | "execution_count": 37, 1925 | "metadata": {}, 1926 | "outputs": [], 1927 | "source": [ 1928 | "\n", 1929 | "# Make the API call passing the augmented prompt to the LLM\n", 1930 | "response = client.chat.completions.create(\n", 1931 | " model=\"gpt-4o-mini\",\n", 1932 | " messages=\t[\n", 1933 | " {\"role\": \"user\", \"content\": sub_query_expansion_prompt}\n", 1934 | " ],\n", 1935 | " response_format={ \"type\": \"json_object\" }\n", 1936 | ")\n", 
1937 | "\n", 1938 | "# Extract the answer from the response object\n", 1939 | "answer=response.choices[0].message.content" 1940 | ] 1941 | }, 1942 | { 1943 | "cell_type": "code", 1944 | "execution_count": 38, 1945 | "metadata": {}, 1946 | "outputs": [ 1947 | { 1948 | "name": "stdout", 1949 | "output_type": "stream", 1950 | "text": [ 1951 | "{\n", 1952 | " \"sub_queries\": [\n", 1953 | " {\n", 1954 | " \"id\": 1,\n", 1955 | " \"focus\": \"Impact on Habitat\",\n", 1956 | " \"query\": \"What changes in habitat occur for polar bears due to climate change?\"\n", 1957 | " },\n", 1958 | " {\n", 1959 | " \"id\": 2,\n", 1960 | " \"focus\": \"Food Availability\",\n", 1961 | " \"query\": \"How does climate change affect the availability of food sources for polar bears?\"\n", 1962 | " },\n", 1963 | " {\n", 1964 | " \"id\": 3,\n", 1965 | " \"focus\": \"Reproductive Health\",\n", 1966 | " \"query\": \"What are the effects of climate change on the reproductive health and population dynamics of polar bears?\"\n", 1967 | " },\n", 1968 | " {\n", 1969 | " \"id\": 4,\n", 1970 | " \"focus\": \"Behavioral Changes\",\n", 1971 | " \"query\": \"How does climate change influence the behavior and migration patterns of polar bears?\"\n", 1972 | " },\n", 1973 | " {\n", 1974 | " \"id\": 5,\n", 1975 | " \"focus\": \"Long-term Survival\",\n", 1976 | " \"query\": \"What are the long-term implications of climate change on the survival of polar bear populations?\"\n", 1977 | " }\n", 1978 | " ]\n", 1979 | "}\n" 1980 | ] 1981 | } 1982 | ], 1983 | "source": [ 1984 | "print(answer)" 1985 | ] 1986 | }, 1987 | { 1988 | "cell_type": "markdown", 1989 | "metadata": {}, 1990 | "source": [ 1991 | "#### Query Transformation\n", 1992 | "\n", 1993 | "Compared to query expansion, in query transformation, instead of the original user query retrieval happens on a transformed query which is more suitable for the retriever" 1994 | ] 1995 | }, 1996 | { 1997 | "cell_type": "code", 1998 | "execution_count": 39, 1999 | 
"metadata": {}, 2000 | "outputs": [], 2001 | "source": [ 2002 | "original_query=\"How does climate change affect polar bears?\"" 2003 | ] 2004 | }, 2005 | { 2006 | "cell_type": "code", 2007 | "execution_count": 40, 2008 | "metadata": {}, 2009 | "outputs": [], 2010 | "source": [ 2011 | "system_prompt=\"You are an expert in climate change and arctic life.\"\n", 2012 | "hyde_prompt=f\"Generate an answer to the question: {original_query}\"" 2013 | ] 2014 | }, 2015 | { 2016 | "cell_type": "code", 2017 | "execution_count": 41, 2018 | "metadata": {}, 2019 | "outputs": [], 2020 | "source": [ 2021 | "\n", 2022 | "# Make the API call passing the augmented prompt to the LLM\n", 2023 | "response = client.chat.completions.create(\n", 2024 | " model=\"gpt-4o-mini\",\n", 2025 | " messages=\t[\n", 2026 | " {\"role\": \"system\", \"content\": system_prompt},\n", 2027 | " {\"role\": \"user\", \"content\": hyde_prompt}\n", 2028 | " ]\n", 2029 | ")\n", 2030 | "\n", 2031 | "# Extract the answer from the response object\n", 2032 | "answer=response.choices[0].message.content" 2033 | ] 2034 | }, 2035 | { 2036 | "cell_type": "code", 2037 | "execution_count": 42, 2038 | "metadata": {}, 2039 | "outputs": [ 2040 | { 2041 | "name": "stdout", 2042 | "output_type": "stream", 2043 | "text": [ 2044 | "Climate change significantly affects polar bears primarily through the loss of sea ice, which is crucial for their survival. Here are some key ways in which climate change impacts polar bears:\n", 2045 | "\n", 2046 | "1. **Loss of Habitat**: Polar bears depend on sea ice as a platform for hunting seals, their primary food source. As global temperatures rise, sea ice is melting at an alarming rate during the summer months and forming later in the fall. This reduced availability of ice means bears have to swim longer distances to find food.\n", 2047 | "\n", 2048 | "2. **Decreased Prey Availability**: As ice melts, the seals that polar bears hunt also face challenges. 
With the loss of stable ice platforms, seal populations may decrease, ultimately leading to food scarcity for polar bears. This reduced access to prey impacts their health, reproduction, and survival rates.\n", 2049 | "\n", 2050 | "3. **Increased Energy Expenditure**: With the melting ice, polar bears may have to travel further in search of food. This increased energy expenditure can lead to fatigue and decreased body condition, especially for mothers who are nursing cubs. Hungry bears are less likely to successfully raise their young.\n", 2051 | "\n", 2052 | "4. **Climate-Induced Stressors**: The stress of adapting to rapidly changing environments can lead to behavioral changes in polar bears. Increased competition for dwindling resources can also lead to aggression among bears, particularly in areas where human activities are encroaching on their habitat.\n", 2053 | "\n", 2054 | "5. **Reproductive Challenges**: The combined effects of reduced food availability and habitat loss affect polar bear reproduction. Poor body condition can result in lower birth rates and higher cub mortality, threatening the long-term viability of polar bear populations.\n", 2055 | "\n", 2056 | "6. **Increased Human-Bear Conflicts**: As bears are forced to venture onto land more frequently due to diminishing sea ice, encounters with human populations can increase. This may lead to more conflicts, potential dangers for both bears and humans, and could affect the conservation strategies in these areas.\n", 2057 | "\n", 2058 | "In summary, climate change poses a significant threat to polar bears by disrupting their habitat, food sources, and overall survival. 
Protecting their environment and mitigating climate change impacts are crucial for the future of these iconic Arctic mammals.\n" 2059 | ] 2060 | } 2061 | ], 2062 | "source": [ 2063 | "print(answer)" 2064 | ] 2065 | }, 2066 | { 2067 | "cell_type": "code", 2068 | "execution_count": 44, 2069 | "metadata": {}, 2070 | "outputs": [ 2071 | { 2072 | "name": "stdout", 2073 | "output_type": "stream", 2074 | "text": [ 2075 | "The embedding dimension is: 1536\n" 2076 | ] 2077 | } 2078 | ], 2079 | "source": [ 2080 | "# Initialize the OpenAIEmbeddings model\n", 2081 | "embeddings = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n", 2082 | "\n", 2083 | "# Create embedding for the hypothetical answer\n", 2084 | "hyde_embedding = embeddings.embed_query(answer)\n", 2085 | "\n", 2086 | "# Check and print the dimension of the embedding\n", 2087 | "embedding_dimension = len(hyde_embedding)\n", 2088 | "print(f\"The embedding dimension is: {embedding_dimension}\")\n" 2089 | ] 2090 | }, 2091 | { 2092 | "cell_type": "markdown", 2093 | "metadata": {}, 2094 | "source": [ 2095 | "## 2. Retrieval Strategies" 2096 | ] 2097 | }, 2098 | { 2099 | "cell_type": "markdown", 2100 | "metadata": {}, 2101 | "source": [ 2102 | "Interventions in the pre-retrieval stage can bring significant improvements in the performance of the RAG system if the query and the knowledge base becomes well aligned with the retrieval algorithm. " 2103 | ] 2104 | }, 2105 | { 2106 | "cell_type": "markdown", 2107 | "metadata": {}, 2108 | "source": [ 2109 | "#### Hybrid Retrieval" 2110 | ] 2111 | }, 2112 | { 2113 | "cell_type": "code", 2114 | "execution_count": 46, 2115 | "metadata": {}, 2116 | "outputs": [ 2117 | { 2118 | "name": "stderr", 2119 | "output_type": "stream", 2120 | "text": [ 2121 | "Fetching pages: 100%|##########| 1/1 [00:00<00:00, 3.25it/s]\n" 2122 | ] 2123 | }, 2124 | { 2125 | "name": "stdout", 2126 | "output_type": "stream", 2127 | "text": [ 2128 | "Retrieval Type: dense\n", 2129 | "Result: 46. 
**^** \"It's official! India set up 2023 World Cup semi-final against New Zealand in 2019 rematch; Pakistan knocked out\". _Hindustan Times_. 11 November 2023. Archived from the original on 14 November 2023. Retrieved 12 November 2023.\n", 2130 | " 47. **^** \"2023 World Cup Cricket Batting Records & Stats runs\". _ESPNcricinfo_. Archived from the original on 18 October 2023. Retrieved 19 October 2023.\n", 2131 | "\n", 2132 | "Retrieval Type: dense\n", 2133 | "Result: 48. **^** \"2023 World Cup Cricket bowling Records & Stats wickets\". _ESPNcricinfo_. Archived from the original on 9 October 2023. Retrieved 10 October 2023.\n", 2134 | " 49. **^** \"India star named Player of the Tournament at ICC Men's Cricket World Cup\". _Cricket World Cup_. Archived from the original on 19 November 2023. Retrieved 19 November 2023.\n", 2135 | "\n", 2136 | "Retrieval Type: dense\n", 2137 | "Result: Main article: 2023 Cricket World Cup final\n", 2138 | "\n", 2139 | "19 November 2023 \n", 2140 | "14:00 (D/N) \n", 2141 | "Scorecard \n", 2142 | "--- \n", 2143 | "**India ** \n", 2144 | "240 (50 overs) | **v** | **Australia** \n", 2145 | "241/4 (43 overs) \n", 2146 | "---|---|--- \n", 2147 | "| | \n", 2148 | "**Australia won by 6 wickets** \n", 2149 | "Narendra Modi Stadium, Ahmedabad \n", 2150 | "--- \n", 2151 | " \n", 2152 | "## Statistics\n", 2153 | "\n", 2154 | "Main article: 2023 Cricket World Cup statistics\n", 2155 | "\n", 2156 | "### Most runs\n", 2157 | "\n", 2158 | "Retrieval Type: dense\n", 2159 | "Result: 2023 ICC Men's Cricket World Cup \n", 2160 | "--- \n", 2161 | "Dates| 5 October – 19 November 2023 \n", 2162 | "Administrator(s)| International Cricket Council \n", 2163 | "Cricket format| One Day International (ODI) \n", 2164 | "Tournament format(s)| Round-robin and knockout \n", 2165 | "Host(s)| India \n", 2166 | "Champions| Australia (6th title) \n", 2167 | "Runners-up| India \n", 2168 | "Participants| 10 \n", 2169 | "Matches| 48 \n", 2170 | "Attendance| 1,250,307 
(26,048 per match) \n", 2171 | "Player of the series| Virat Kohli \n", 2172 | "Most runs| Virat Kohli (765) \n", 2173 | "Most wickets| Mohammed Shami (24)\n", 2174 | "\n", 2175 | "Retrieval Type: dense\n", 2176 | "Result: 53. ^ _**a**_ _**b**_ Pennington, Adrian. \"Behind the ICC Men's Cricket World Cup 2023 innovations with vertical video feed plus ball and player tracking\". _SVG Europe_. Retrieved 15 February 2024.\n", 2177 | " 54. ^ _**a**_ _**b**_ \"Disney sets India cricket viewership record for TV, streaming during world cup\". _Reuters_. 23 November 2023. Retrieved 14 February 2024.\n", 2178 | "\n", 2179 | "Retrieval Type: sparse\n", 2180 | "Result: Virat Kohli was named the player of the tournament and also scored the most\n", 2181 | "runs, while Mohammed Shami was the leading wicket-taker. A total of 1,250,307\n", 2182 | "spectators attended the matches, the highest number in any Cricket World Cup\n", 2183 | "to date.[1] The tournament final set viewership records in India, drawing 518\n", 2184 | "million viewers, with a peak of 57 million streaming viewers.\n", 2185 | "\n", 2186 | "## Background\n", 2187 | "\n", 2188 | "### Host selection\n", 2189 | "\n", 2190 | "Retrieval Type: sparse\n", 2191 | "Result: Participants| 10 \n", 2192 | "Matches| 48 \n", 2193 | "Attendance| 1,250,307 (26,048 per match) \n", 2194 | "Player of the series| Virat Kohli \n", 2195 | "Most runs| Virat Kohli (765) \n", 2196 | "Most wickets| Mohammed Shami (24) \n", 2197 | "Official website| cricketworldcup.com \n", 2198 | "<- 2019 _2027_ -> \n", 2199 | "**Part of a series on the** \n", 2200 | "--- \n", 2201 | "2023 Cricket World Cup / \n", 2202 | "2025 ICC Champions Trophy \n", 2203 | "CWC: Category • Commons \n", 2204 | "CT: Category • Commons \n", 2205 | "2023 Cricket World Cup \n", 2206 | "Background\n", 2207 | "\n", 2208 | "Retrieval Type: sparse\n", 2209 | "Result: 2023 ICC Men's Cricket World Cup \n", 2210 | "--- \n", 2211 | "Dates| 5 October – 19 November 2023 \n", 2212 | 
"Administrator(s)| International Cricket Council \n", 2213 | "Cricket format| One Day International (ODI) \n", 2214 | "Tournament format(s)| Round-robin and knockout \n", 2215 | "Host(s)| India \n", 2216 | "Champions| Australia (6th title) \n", 2217 | "Runners-up| India \n", 2218 | "Participants| 10 \n", 2219 | "Matches| 48 \n", 2220 | "Attendance| 1,250,307 (26,048 per match) \n", 2221 | "Player of the series| Virat Kohli \n", 2222 | "Most runs| Virat Kohli (765) \n", 2223 | "Most wickets| Mohammed Shami (24)\n", 2224 | "\n", 2225 | "Retrieval Type: sparse\n", 2226 | "Result: Main article: 2023 Cricket World Cup statistics\n", 2227 | "\n", 2228 | "### Most runs\n", 2229 | "\n", 2230 | "Runs | Player | Team \n", 2231 | "---|---|--- \n", 2232 | "765 | Virat Kohli | India \n", 2233 | "597 | Rohit Sharma | India \n", 2234 | "594 | Quinton de Kock | South Africa \n", 2235 | "578 | Rachin Ravindra | New Zealand \n", 2236 | "552 | Daryl Mitchell | New Zealand \n", 2237 | " \n", 2238 | " * Source: ESPNcricinfo[47]\n", 2239 | "\n", 2240 | "### Most wickets\n", 2241 | "\n" 2242 | ] 2243 | }, 2244 | { 2245 | "name": "stderr", 2246 | "output_type": "stream", 2247 | "text": [ 2248 | "/var/folders/kz/0m0zvdwn54798h3cfz47bcjw0000gn/T/ipykernel_29711/394055649.py:58: LangChainDeprecationWarning: The method `BaseRetriever.get_relevant_documents` was deprecated in langchain-core 0.1.46 and will be removed in 1.0. 
Use :meth:`~invoke` instead.\n", 2249 | " sparse_results = bm25_retriever.get_relevant_documents(query)\n" 2250 | ] 2251 | } 2252 | ], 2253 | "source": [ 2254 | "import faiss\n", 2255 | "from langchain_community.vectorstores import FAISS\n", 2256 | "from langchain_community.docstore.in_memory import InMemoryDocstore\n", 2257 | "from langchain_openai import OpenAIEmbeddings\n", 2258 | "from langchain_core.documents import Document\n", 2259 | "from langchain_community.document_loaders import AsyncHtmlLoader\n", 2260 | "from langchain_community.document_transformers import Html2TextTransformer\n", 2261 | "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", 2262 | "from langchain_community.retrievers import BM25Retriever\n", 2263 | "\n", 2264 | "# Step 1: Load data from a URL (Wikipedia page)\n", 2265 | "url = \"https://en.wikipedia.org/wiki/2023_Cricket_World_Cup\"\n", 2266 | "loader = AsyncHtmlLoader(url)\n", 2267 | "data = loader.load()\n", 2268 | "\n", 2269 | "# Step 2: Transform the HTML content to plain text\n", 2270 | "html2text = Html2TextTransformer()\n", 2271 | "data_transformed = html2text.transform_documents(data)\n", 2272 | "\n", 2273 | "# Step 3: Split the text into smaller chunks using RecursiveCharacterTextSplitter\n", 2274 | "text_splitter = RecursiveCharacterTextSplitter(\n", 2275 | " chunk_size=500, # Number of characters in each chunk\n", 2276 | " chunk_overlap=200 # Number of overlapping characters between chunks\n", 2277 | ")\n", 2278 | "chunks = text_splitter.split_text(data_transformed[0].page_content)\n", 2279 | "\n", 2280 | "# Step 4: Dense Retrieval (FAISS + OpenAI Embeddings)\n", 2281 | "\n", 2282 | "# Initialize OpenAI Embeddings model for dense retrieval\n", 2283 | "embedding_model = OpenAIEmbeddings(model=\"text-embedding-ada-002\")\n", 2284 | "\n", 2285 | "# Initialize FAISS index for dense retrieval\n", 2286 | "embedding_dim = len(embedding_model.embed_query(\"hello world\"))\n", 2287 | "index = 
faiss.IndexFlatL2(embedding_dim)\n", 2288 | "\n", 2289 | "# Create an in-memory document store to support adding documents\n", 2290 | "docstore = InMemoryDocstore()\n", 2291 | "\n", 2292 | "# Initialize FAISS vector store\n", 2293 | "vector_store = FAISS(embedding_function=embedding_model, index=index, docstore=docstore, index_to_docstore_id={})\n", 2294 | "\n", 2295 | "# Add chunks to FAISS vector store\n", 2296 | "documents = [Document(page_content=chunk) for chunk in chunks]\n", 2297 | "vector_store.add_documents(documents)\n", 2298 | "\n", 2299 | "# Step 5: Sparse Retrieval (BM25 using LangChain's BM25Retriever)\n", 2300 | "\n", 2301 | "# Initialize BM25Retriever\n", 2302 | "bm25_retriever = BM25Retriever.from_documents(documents)\n", 2303 | "\n", 2304 | "# Step 6: Hybrid Retrieval Strategy\n", 2305 | "\n", 2306 | "def hybrid_search(query, k=5):\n", 2307 | " # Step 6.1: Perform dense retrieval using FAISS\n", 2308 | " dense_results = vector_store.similarity_search(query=query, k=k)\n", 2309 | " \n", 2310 | " # Step 6.2: Perform sparse retrieval using BM25Retriever\n", 2311 | " sparse_results = bm25_retriever.get_relevant_documents(query)\n", 2312 | " \n", 2313 | " # Limit sparse results to top-k\n", 2314 | " sparse_results = sparse_results[:k]\n", 2315 | " \n", 2316 | " # Step 6.3: Combine dense and sparse results\n", 2317 | " combined_results = []\n", 2318 | " for dense_doc in dense_results:\n", 2319 | " combined_results.append((\"dense\", dense_doc.page_content))\n", 2320 | "\n", 2321 | " for sparse_doc in sparse_results:\n", 2322 | " combined_results.append((\"sparse\", sparse_doc.page_content))\n", 2323 | "\n", 2324 | " # Optionally, re-rank or further process combined results\n", 2325 | " return combined_results\n", 2326 | "\n", 2327 | "# Step 7: Perform a hybrid search\n", 2328 | "query = \"Virat Kohli records in 2023 Cricket World Cup\"\n", 2329 | "hybrid_results = hybrid_search(query)\n", 2330 | "\n", 2331 | "# Step 8: Display the results\n", 2332 | 
"for retrieval_type, result in hybrid_results:\n", 2333 | " print(f\"Retrieval Type: {retrieval_type}\")\n", 2334 | " print(f\"Result: {result}\\n\")\n" 2335 | ] 2336 | }, 2337 | { 2338 | "cell_type": "markdown", 2339 | "metadata": {}, 2340 | "source": [ 2341 | "## 3. Post Retrieval Stage" 2342 | ] 2343 | }, 2344 | { 2345 | "cell_type": "markdown", 2346 | "metadata": {}, 2347 | "source": [ 2348 | "At the post-retrieval stage the approaches of reranking and compression help in providing better context to the LLM for generation." 2349 | ] 2350 | }, 2351 | { 2352 | "cell_type": "markdown", 2353 | "metadata": {}, 2354 | "source": [ 2355 | "#### Compression" 2356 | ] 2357 | }, 2358 | { 2359 | "cell_type": "markdown", 2360 | "metadata": {}, 2361 | "source": [ 2362 | "In prompt compression, language models are used to detect and remove unimportant and irrelevant tokens" 2363 | ] 2364 | }, 2365 | { 2366 | "cell_type": "code", 2367 | "execution_count": 47, 2368 | "metadata": {}, 2369 | "outputs": [], 2370 | "source": [ 2371 | "document_to_compress=retrieved_docs[0].page_content" 2372 | ] 2373 | }, 2374 | { 2375 | "cell_type": "code", 2376 | "execution_count": 48, 2377 | "metadata": {}, 2378 | "outputs": [], 2379 | "source": [ 2380 | "compress_prompt = f\"Compress the following document into very short sentences, retaining only the extremely essential information:\\n\\n{document_to_compress}\"" 2381 | ] 2382 | }, 2383 | { 2384 | "cell_type": "code", 2385 | "execution_count": 49, 2386 | "metadata": {}, 2387 | "outputs": [], 2388 | "source": [ 2389 | "# Make the API call passing the augmented prompt to the LLM\n", 2390 | "response = client.chat.completions.create(\n", 2391 | " model=\"gpt-4o-mini\",\n", 2392 | " messages=\t[\n", 2393 | " {\"role\": \"user\", \"content\": compress_prompt}\n", 2394 | " ]\n", 2395 | ")\n", 2396 | "\n", 2397 | "# Extract the answer from the response object\n", 2398 | "answer=response.choices[0].message.content" 2399 | ] 2400 | }, 2401 | { 2402 | 
"cell_type": "code", 2403 | "execution_count": 50, 2404 | "metadata": {}, 2405 | "outputs": [ 2406 | { 2407 | "name": "stdout", 2408 | "output_type": "stream", 2409 | "text": [ 2410 | "The 2023 ICC Men's Cricket World Cup took place in India from October 5 to\n", 2411 | "November 19, 2023. It was the 13th ODI tournament with ten teams. Australia won,\n", 2412 | "defeating India in the final. The match was held at the Narendra Modi Stadium in\n", 2413 | "Ahmedabad. Attendance exceeded 1.25 million, and the final had 518 million\n", 2414 | "viewers. Virat Kohli was Player of the Series with 765 runs. Mohammed Shami took\n", 2415 | "the most wickets, with 24. The event was postponed from early 2023 due to\n", 2416 | "COVID-19. New penalties for slow over-rates were introduced. The format included\n", 2417 | "a round-robin stage and knockout rounds. Final Match: - Date: 19 November 2023\n", 2418 | "- India: 240 (50 overs) - Australia: 241/4 (43 overs) - Australia won by 6\n", 2419 | "wickets. Top Performers: - Most Runs: Virat Kohli (765), Rohit Sharma (597),\n", 2420 | "Quinton de Kock (594). - Most Wickets: Mohammed Shami (24), Adam Zampa (23),\n", 2421 | "Dilshan Madushanka (21).\n" 2422 | ] 2423 | } 2424 | ], 2425 | "source": [ 2426 | "print(textwrap.fill(answer, width=80))" 2427 | ] 2428 | }, 2429 | { 2430 | "cell_type": "code", 2431 | "execution_count": 51, 2432 | "metadata": {}, 2433 | "outputs": [ 2434 | { 2435 | "name": "stdout", 2436 | "output_type": "stream", 2437 | "text": [ 2438 | "The 2023 ICC Men's Cricket World Cup, held in India from October 5 to November\n", 2439 | "19, 2023, marked the 13th edition of this prestigious One Day International\n", 2440 | "(ODI) tournament, featuring ten national teams. Australia emerged as champions,\n", 2441 | "claiming their sixth title by defeating India in the final at the Narendra Modi\n", 2442 | "Stadium in Ahmedabad. 
The tournament attracted a record attendance of over 1.25\n", 2443 | "million spectators and set viewership records in India, with 518 million viewers\n", 2444 | "for the final. Virat Kohli was named Player of the Series, scoring the most runs\n", 2445 | "(765), while Mohammed Shami led in wickets taken (24). The event was initially\n", 2446 | "scheduled for early 2023 but was postponed due to the COVID-19 pandemic, and it\n", 2447 | "introduced new penalties for slow over-rates. The tournament format included a\n", 2448 | "round-robin group stage followed by knockout rounds, culminating in a highly\n", 2449 | "anticipated final. Main article: 2023 Cricket World Cup final 19 November 2023\n", 2450 | "14:00 (D/N) Scorecard --- **India ** 240 (50 overs) | **v** |\n", 2451 | "**Australia** 241/4 (43 overs) ---|---|--- | | **Australia won by 6\n", 2452 | "wickets** Narendra Modi Stadium, Ahmedabad --- ## Statistics Main\n", 2453 | "article: 2023 Cricket World Cup statistics ### Most runs Runs | Player |\n", 2454 | "Team ---|---|--- 765 | Virat Kohli | India 597 | Rohit Sharma | India\n", 2455 | "594 | Quinton de Kock | South Africa 578 | Rachin Ravindra | New Zealand\n", 2456 | "552 | Daryl Mitchell | New Zealand * Source: ESPNcricinfo[47] ### Most\n", 2457 | "wickets Wickets | Player | Team ---|---|--- 24 | Mohammed Shami |\n", 2458 | "India 23 | Adam Zampa | Australia 21 | Dilshan Madushanka | Sri Lanka\n", 2459 | "20 | Jasprit Bumrah | India 20 | Gerald Coetzee | South Africa *\n", 2460 | "Source: ESPNcricinfo[48] ### Team of the tournament\n" 2461 | ] 2462 | } 2463 | ], 2464 | "source": [ 2465 | "print(textwrap.fill(document_to_compress, width=80))" 2466 | ] 2467 | }, 2468 | { 2469 | "cell_type": "markdown", 2470 | "metadata": {}, 2471 | "source": [ 2472 | "---" 2473 | ] 2474 | }, 2475 | { 2476 | "cell_type": "markdown", 2477 | "metadata": {}, 2478 | "source": [ 2479 | " \n", 2480 | "\n", 2481 | "Hi! I'm Abhinav! 
I am an entrepreneur and Vice President of Artificial Intelligence at Yarnit. I have spent over 15 years consulting and leadership roles in data science, machine learning and AI. My current focus is in the applied Generative AI domain focussing on solving enterprise needs through contextual intelligence. I'm passionate about AI advancements constantly exploring emerging technologies to push the boundaries and create positive impacts in the world. Let’s build the future, together!\n", 2482 | "\n", 2483 | "[If you haven't already, please subscribe to the MEAP of A Simple Guide to Retrieval Augmented Generation here](https://mng.bz/8wdg)\n", 2484 | "\n", 2485 | "\n", 2486 | " \"New\n", 2487 | "\n", 2488 | "\n", 2489 | "#### If you'd like to chat, I'd be very happy to connect\n", 2490 | "\n", 2491 | "[![GitHub followers](https://img.shields.io/badge/Github-000000?style=for-the-badge&logo=github&logoColor=black&color=orange)](https://github.com/abhinav-kimothi)\n", 2492 | "[![LinkedIn](https://img.shields.io/badge/LinkedIn-000000?style=for-the-badge&logo=linkedin&logoColor=orange&color=black)](https://www.linkedin.com/comm/mynetwork/discovery-see-all?usecase=PEOPLE_FOLLOWS&followMember=abhinav-kimothi)\n", 2493 | "[![Medium](https://img.shields.io/badge/Medium-000000?style=for-the-badge&logo=medium&logoColor=black&color=orange)](https://medium.com/@abhinavkimothi)\n", 2494 | "[![Insta](https://img.shields.io/badge/Instagram-000000?style=for-the-badge&logo=instagram&logoColor=orange&color=black)](https://www.instagram.com/akaiworks/)\n", 2495 | "[![Mail](https://img.shields.io/badge/email-000000?style=for-the-badge&logo=gmail&logoColor=black&color=orange)](mailto:abhinav.kimothi.ds@gmail.com)\n", 2496 | "[![X](https://img.shields.io/badge/Follow-000000?style=for-the-badge&logo=X&logoColor=orange&color=black)](https://twitter.com/abhinav_kimothi)\n", 2497 | 
"[![Linktree](https://img.shields.io/badge/Linktree-000000?style=for-the-badge&logo=linktree&logoColor=black&color=orange)](https://linktr.ee/abhinavkimothi)\n", 2498 | "[![Gumroad](https://img.shields.io/badge/Gumroad-000000?style=for-the-badge&logo=gumroad&logoColor=orange&color=black)](https://abhinavkimothi.gumroad.com/)\n", 2499 | "\n", 2500 | "---" 2501 | ] 2502 | }, 2503 | { 2504 | "cell_type": "markdown", 2505 | "metadata": {}, 2506 | "source": [] 2507 | } 2508 | ], 2509 | "metadata": { 2510 | "kernelspec": { 2511 | "display_name": ".ch6", 2512 | "language": "python", 2513 | "name": "python3" 2514 | }, 2515 | "language_info": { 2516 | "codemirror_mode": { 2517 | "name": "ipython", 2518 | "version": 3 2519 | }, 2520 | "file_extension": ".py", 2521 | "mimetype": "text/x-python", 2522 | "name": "python", 2523 | "nbconvert_exporter": "python", 2524 | "pygments_lexer": "ipython3", 2525 | "version": "3.13.2" 2526 | } 2527 | }, 2528 | "nbformat": 4, 2529 | "nbformat_minor": 2 2530 | } 2531 | -------------------------------------------------------------------------------- /Chapters/Readme.md: -------------------------------------------------------------------------------- 1 | # A Simple Introduction to RAG (Code Snippets) 2 | 3 | 4 | New MEAP 5 | 6 | 7 | 8 | - Chapter 3 - Indexing Pipeline : Creating a knowledge base for RAG based applications [First draft Released](./Chapter-03/indexing_pipeline.ipynb) 9 | 10 | - Chapter 4 - Generation Pipeline: Real time interaction for contextual responses [First draft Released](./Chapter-04/generation_pipeline.ipynb) 11 | 12 | - Chapter 5 - RAG Evaluation : Checking accuracy, relevance and faithfulness [First draft Released](./Chapter-05/rag_evaluations.ipynb) 13 | 14 | - Chapter 6 - Progression of RAG systems : Naive to Advanced to Modular [First draft Released](./Chapter-06/advanced_rag.ipynb) 15 | 16 | --------------------------------------------------------------------------------
/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Abhinav Kimothi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # A Simple Guide to Retrieval Augmented Generation 2 | This repository is the source code for examples and illustrations discussed in the book - [A Simple Guide to Retrieval Augmented Generation](https://mng.bz/8wdg) published by [Manning Publications](https://www.manning.com/?utm_source=kimothi&utm_medium=affiliate&utm_campaign=affiliate&a_aid=kimothi) 3 | 4 | 5 | New MEAP 6 | 7 | 8 | Retrieval Augmented Generation, or RAG, stands as a pivotal technique shaping the landscape of the applied generative AI. 
A novel concept introduced by Lewis et al. in their seminal paper Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks (https://arxiv.org/abs/2005.11401), RAG has swiftly emerged as a cornerstone, enhancing reliability and trustworthiness in the outputs from Large Language Models (LLMs) 9 | 10 | About the book 11 | --- 12 | This book is a foundational guide designed particularly for beginners looking for an easy, yet comprehensive introduction to Retrieval Augmented Generation. This book does not go deep into the technical nitty-gritties of RAG; rather, it provides an overview. Data Scientists, Data Engineers, ML Engineers, Software Developers, Technology Leaders, Students and Academicians interested in generative AI powered application development will find this book valuable. Upon completing this book, you can expect to: 13 | 14 | - Develop a solid understanding of RAG fundamentals, the components of a RAG enabled system and its practical applications. 15 | 16 | - Know what a non-parametric knowledge base for RAG means and how it is created. 17 | 18 | - Gain knowledge about developing a RAG enabled system with details about the indexing pipeline and the generation pipeline.
19 | - Gain deep insights into the evaluation of RAG enabled systems and modularised evaluation strategies 20 | 21 | - Familiarize yourself with advanced RAG strategies and the evolving landscape 22 | 23 | - Acquire knowledge of available tools, technologies and frameworks for building and deploying production grade RAG systems 24 | 25 | - Get an understanding of the current limitations of RAG and an exposure to popular emerging techniques for further exploration 26 | 27 | __Note: This book is still in development and is scheduled to be completed in the next few months__ 28 | 29 | Link to the [official source code repository](https://github.com/abhinav-kimothi/A-Simple-Introduction-to-RAG) 30 | 31 | Link to [join the MEAP at manning.com](https://mng.bz/8wdg) 32 | 33 | To download a copy of this repository, click on the [Download ZIP](https://github.com/abhinav-kimothi/A-Simple-Introduction-to-RAG/archive/refs/heads/main.zip) button or execute the following command in your terminal: 34 | 35 | ``` 36 | git clone https://github.com/abhinav-kimothi/A-Simple-Introduction-to-RAG.git 37 | ``` 38 | 39 | 40 | Table of Contents 41 | --- 42 | 43 | The first three chapters of the book have been released as a part of the Manning Early Access Program. 
You can [join the MEAP here](https://mng.bz/8wdg) 44 | 45 | - Chapter 1 - Large Language Models and the Need for Retrieval Augmented Generation [First draft Released] 46 | 47 | - Chapter 2 - RAG-enabled systems and their design [First draft Released] 48 | 49 | - Chapter 3 - Indexing Pipeline : Creating a knowledge base for RAG based applications [First draft Released] [Notebook](./Chapters/Chapter-03/indexing_pipeline.ipynb) 50 | 51 | - Chapter 4 - Generation Pipeline: Real time interaction for contextual responses [Notebook](./Chapters/Chapter-04/generation_pipeline.ipynb) 52 | 53 | - Chapter 5 - RAG Evaluation : Checking accuracy, relevance and faithfulness [Notebook](./Chapters/Chapter-05/rag_evaluations.ipynb) 54 | 55 | - Chapter 6 - Progression of RAG systems : Naive to Advanced to Modular [Notebook](./Chapters/Chapter-06/advanced_rag.ipynb) 56 | 57 | - Chapter 7 - Evolving RAGOps Stack : Technologies that make RAG possible 58 | 59 | - Chapter 8 - Nuances : Comparison with fine-tuning, multimodal and agentic RAG 60 | 61 | - Chapter 9 - Cutting Edge : Best practices and further exploration 62 | 63 | Why join MEAP? 64 | --- 65 | By joining the Manning Early Access Program, you'll get: 66 | - Immediate access to the book's current draft and all future updates 67 | - A chance to provide feedback and shape the final content 68 | - Exclusive discounts and early-bird offers 69 | 70 | Code 71 | --- 72 | Code Snippets are organised in the Chapters Directory by [Chapters](./Chapters) 73 | 74 | 75 | 76 | - Chapter 1 - Does not have any code 77 | 78 | - Chapter 2 - Does not have any code 79 | 80 | - [Chapter 3 - This notebook](./Chapters/Chapter-03/indexing_pipeline.ipynb) outlines the indexing pipeline. 81 | 82 | A knowledge base is created for the 2023 Cricket World Cup based on the Wikipedia Article on the topic.
We use __AsyncHtmlLoader__ and __Html2TextTransformer__ to load the article, chunk the text using __RecursiveCharacterTextSplitter__, use __text-embedding-3-large__ from OpenAI to convert chunks into vectors and use __FAISS__ as the vector index to store the embeddings. 83 | 84 | - [Chapter 4 - This notebook](./Chapters/Chapter-04/generation_pipeline.ipynb) outlines the generation pipeline. 85 | 86 | We use the knowledge base created in Chapter 03 on the Wikipedia article on 2023 Cricket World Cup. We load the __FAISS__ index and use the __similarity search__ function to retrieve chunks. We then augment the user query with the retrieved chunk and use __GPT 4o__ model from OpenAI to generate the response. 87 | 88 | This notebook also includes functions that can be used to generate answers for different queries that a user may want to ask. 89 | 90 | Additionally, this chapter contains [another notebook](./Chapters/Chapter-04/xtra_tfidf_bm25_retriever.ipynb) that shows the usage of TF-IDF and BM25 as retriever algorithms. 91 | 92 | - [Chapter 5 - This notebook](./Chapters/Chapter-05/rag_evaluations.ipynb) evaluates the RAG pipeline created in chapters 3 and 4 using the __RAGAS__ framework. 93 | 94 | Additionally, this chapter includes a [notebook](./Chapters/Chapter-05/xtra_benchmarking.ipynb) that uses LangChain Benchmarks to benchmark our RAG pipeline on LangChain QnA docs. 95 | 96 | - [Chapter 6 - This notebook](./Chapters/Chapter-06/advanced_rag.ipynb) demonstrates selected Index Optimization, Query Optimization, Retrieval Strategies and Post Retrieval Compression techniques.
97 | 98 | __Note: This is a WIP repository and subsequent chapters will be released on an ongoing basis__ 99 | 100 | Setup 101 | --- 102 | 103 | Clone this repository to your local machine: 104 | 105 | ``` 106 | git clone https://github.com/abhinav-kimothi/A-Simple-Introduction-to-RAG.git 107 | ``` 108 | 109 | Navigate to the cloned repository: 110 | 111 | 112 | cd A-Simple-Introduction-to-RAG 113 | 114 | It's recommended to use a virtual environment to avoid conflicts with other projects or system-wide Python packages. 115 | 116 | Run the following command to create a virtual environment named .myenv (you can name it anything you like): 117 | 118 | ``` 119 | python3 -m venv .myenv 120 | ``` 121 | Activate the Virtual Environment: 122 | 123 | - On Windows, activate the virtual environment by running: 124 | 125 | ``` 126 | .myenv\Scripts\activate.bat 127 | ``` 128 | 129 | - On macOS and Linux, activate it with: 130 | 131 | ``` 132 | source .myenv/bin/activate 133 | ``` 134 | 135 | Install the package requirements from the requirements.txt by executing the following pip installation command: 136 | 137 | 138 | ``` 139 | pip install -r requirements.txt 140 | ``` 141 | 142 | _recommended_: Store your API keys in a .env file: 143 | ``` 144 | OPENAI_API_KEY= 145 | LANGCHAIN_API_KEY= 146 | 147 | ### You can also look at the .\example_dot_env file in this repo for the structure. Remember to rename to .env 148 | ``` 149 | 150 | The notebooks in this repository need __python version > 3.11.1__ 151 | 152 | Feedback & Contribution 153 | --- 154 | I'd love to hear what you think about the code here and the book in general. I appreciate and welcome all feedback.
You can either post your thoughts, questions, critiques and ideas in the [Discussion forum of this repo](https://github.com/abhinav-kimothi/A-Simple-Introduction-to-RAG/discussions) or if you've purchased the [MEAP](https://mng.bz/8wdg) you can also provide your feedback on the [livebook on manning.com](https://livebook.manning.com/book/a-simple-guide-to-retrieval-augmented-generation). 155 | 156 | If you notice any errors or issues with the code, please raise an [issue here in the repo](https://github.com/abhinav-kimothi/A-Simple-Introduction-to-RAG/issues) 157 | 158 | About me 159 | --- 160 | Hi! I'm Abhinav! I am an entrepreneur and Vice President of Artificial Intelligence at Yarnit. I have spent over 15 years in consulting and leadership roles in data science, machine learning and AI. My current focus is in the applied Generative AI domain focussing on solving enterprise needs through contextual intelligence. I'm passionate about AI advancements, constantly exploring emerging technologies to push the boundaries and create positive impacts in the world. Let’s build the future, together!
161 | 162 | #### If you'd like to chat, I'd be very happy to connect 163 | 164 | [![GitHub followers](https://img.shields.io/badge/Github-000000?style=for-the-badge&logo=github&logoColor=black&color=orange)](https://github.com/abhinav-kimothi) 165 | [![LinkedIn](https://img.shields.io/badge/LinkedIn-000000?style=for-the-badge&logo=linkedin&logoColor=orange&color=black)](https://www.linkedin.com/comm/mynetwork/discovery-see-all?usecase=PEOPLE_FOLLOWS&followMember=abhinav-kimothi) 166 | [![Medium](https://img.shields.io/badge/Medium-000000?style=for-the-badge&logo=medium&logoColor=black&color=orange)](https://medium.com/@abhinavkimothi) 167 | [![Insta](https://img.shields.io/badge/Instagram-000000?style=for-the-badge&logo=instagram&logoColor=orange&color=black)](https://www.instagram.com/akaiworks/) 168 | [![Mail](https://img.shields.io/badge/email-000000?style=for-the-badge&logo=gmail&logoColor=black&color=orange)](mailto:abhinav.kimothi.ds@gmail.com) 169 | [![X](https://img.shields.io/badge/Follow-000000?style=for-the-badge&logo=X&logoColor=orange&color=black)](https://twitter.com/abhinav_kimothi) 170 | [![Linktree](https://img.shields.io/badge/Linktree-000000?style=for-the-badge&logo=linktree&logoColor=black&color=orange)](https://linktr.ee/abhinavkimothi) 171 | [![Gumroad](https://img.shields.io/badge/Gumroad-000000?style=for-the-badge&logo=gumroad&logoColor=orange&color=black)](https://abhinavkimothi.gumroad.com/) 172 | 173 | 174 | [If you haven't already, please subscribe to the MEAP of A Simple Guide to Retrieval Augmented Generation here](https://mng.bz/8wdg) 175 | 176 | 177 | New MEAP 178 | 179 | 180 | --- 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | -------------------------------------------------------------------------------- /example_dot_env: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY=sk-proj-****** 2 | LANGCHAIN_API_KEY=lsv2_****** 
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | langchain==0.3.19 2 | langchain_community==0.3.18 3 | bs4==0.0.2 4 | html2text==2024.2.26 5 | matplotlib==3.10.0 6 | lxml==5.3.1 7 | langchain_huggingface==0.1.2 8 | openai==1.64.0 9 | langchain-openai==0.3.7 10 | faiss-cpu==1.10.0 11 | python-dotenv==1.0.1 12 | scikit-learn==1.6.1 13 | rank_bm25==0.2.2 14 | ragas==0.2.13 15 | rapidfuzz==3.12.1 --------------------------------------------------------------------------------