├── .gitignore ├── .python-version ├── 01_simple_rag.ipynb ├── 02_embedding_model.ipynb ├── 03_semantic_chunking.ipynb ├── 04_contextual_retrieval.ipynb ├── 05_reverse_hyde.ipynb ├── 06_hybrid_search.ipynb ├── 07_reranking.ipynb ├── 08_multimodal_pdf.ipynb ├── LICENSE ├── README.md ├── data ├── corpus.json ├── dense_results.json ├── shokz │ └── OpenRun Pro User Guide.pdf ├── sparse_results.json └── top_rated_wines.csv ├── exercise ├── 01_simple_rag_exercise.ipynb ├── 02_advanced_chunking_exercise.ipynb ├── 03_hybrid_search_exercise.ipynb └── random_page.png ├── images ├── Advanced_RAG.png ├── Hybrid_Search.png ├── Naiive_RAG.png ├── Recall_Precision_in_RAG_Diagram.png └── advanced-rag-setup.gif ├── pyproject.toml ├── requirements.in ├── requirements.txt ├── themes ├── dark.theme └── light.theme └── uv.lock /.gitignore: -------------------------------------------------------------------------------- 1 | # Jupyter Notebook 2 | .ipynb_checkpoints 3 | *.ipynb_checkpoints/ 4 | 5 | # Python 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | 29 | # Sphinx documentation 30 | docs/_build/ 31 | 32 | # PyBuilder 33 | target/ 34 | 35 | # Environments 36 | .env 37 | .venv 38 | env/ 39 | venv/ 40 | ENV/ 41 | env.bak/ 42 | venv.bak/ 43 | 44 | # OS generated files 45 | .DS_Store 46 | .DS_Store? 47 | ._* 48 | .Spotlight-V100 49 | .Trashes 50 | ehthumbs.db 51 | Thumbs.db 52 | .vscode/spellright.dict 53 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | pypy@3.11 2 | -------------------------------------------------------------------------------- /04_contextual_retrieval.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Enhancing RAG with Contextual Retrieval\n", 8 | "\n", 9 | "We will use an LLM to generate a contextual sentence for each chunk of a document, to improve its retrieval accuracy and its use in hybrid search.\n", 10 | "\n", 11 | "* [Load complex documents dataset](#loading-a-complex-dataset-of-documents)\n", 12 | "* [Split the documents into chunks](#split-the-documents-into-chunks)\n", 13 | "* [Generate the context sentence](#generate-the-context-sentence)\n", 14 | "* [Enrich the chunk embedding vectors with the context](#enrich-the-chunk-embedding-vectors-with-the-context)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "### Visual improvements\n", 22 | "\n", 23 | "We will use the [rich library](https://github.com/Textualize/rich) to make the output more readable, and suppress warning messages." 
24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 1, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "from rich.console import Console\n", 33 | "from rich_theme_manager import Theme, ThemeManager\n", 34 | "import pathlib\n", 35 | "\n", 36 | "theme_dir = pathlib.Path(\"themes\")\n", 37 | "theme_manager = ThemeManager(theme_dir=theme_dir)\n", 38 | "dark = theme_manager.get(\"dark\")\n", 39 | "\n", 40 | "# Create a console with the dark theme\n", 41 | "console = Console(theme=dark)" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "import warnings\n", 51 | "\n", 52 | "# Suppress warnings\n", 53 | "warnings.filterwarnings('ignore')" 54 | ] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "## Loading a complex dataset of documents\n", 61 | "\n", 62 | "We will load a complex dataset of scientific documents from Arxiv. Applying naive chunks on such documents will give poor results in RAG applications." 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 3, 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/html": [ 73 | "
Dataset({\n",
 74 |        "    features: ['id', 'title', 'summary', 'source', 'authors', 'categories', 'comment', 'journal_ref', \n",
 75 |        "'primary_category', 'published', 'updated', 'content', 'references'],\n",
 76 |        "    num_rows: 2673\n",
 77 |        "})\n",
 78 |        "
\n" 79 | ], 80 | "text/plain": [ 81 | "\u001b[93mDataset\u001b[0m\u001b[1m(\u001b[0m\u001b[1m{\u001b[0m\n", 82 | " features: \u001b[1m[\u001b[0m\u001b[92m'id'\u001b[0m, \u001b[92m'title'\u001b[0m, \u001b[92m'summary'\u001b[0m, \u001b[92m'source'\u001b[0m, \u001b[92m'authors'\u001b[0m, \u001b[92m'categories'\u001b[0m, \u001b[92m'comment'\u001b[0m, \u001b[92m'journal_ref'\u001b[0m, \n", 83 | "\u001b[92m'primary_category'\u001b[0m, \u001b[92m'published'\u001b[0m, \u001b[92m'updated'\u001b[0m, \u001b[92m'content'\u001b[0m, \u001b[92m'references'\u001b[0m\u001b[1m]\u001b[0m,\n", 84 | " num_rows: \u001b[91m2673\u001b[0m\n", 85 | "\u001b[1m}\u001b[0m\u001b[1m)\u001b[0m\n" 86 | ] 87 | }, 88 | "metadata": {}, 89 | "output_type": "display_data" 90 | } 91 | ], 92 | "source": [ 93 | "from datasets import load_dataset\n", 94 | "\n", 95 | "dataset = load_dataset(\"jamescalam/ai-arxiv2\", split=\"train\")\n", 96 | "console.print(dataset)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "## Split the documents into Chunks\n", 104 | "\n", 105 | "We will use the statistical chunker that we used in a previous notebook." 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 4, 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "data": { 115 | "text/plain": [ 116 | "True" 117 | ] 118 | }, 119 | "execution_count": 4, 120 | "metadata": {}, 121 | "output_type": "execute_result" 122 | } 123 | ], 124 | "source": [ 125 | "from dotenv import load_dotenv\n", 126 | "\n", 127 | "load_dotenv()" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 5, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "import os\n", 137 | "from semantic_router.encoders import OpenAIEncoder\n", 138 | "\n", 139 | "encoder = OpenAIEncoder(name=\"text-embedding-3-small\")" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 6, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "from semantic_chunkers import StatisticalChunker\n", 149 | "import logging\n", 150 | "\n", 151 | "logging.disable(logging.CRITICAL)\n", 152 | "\n", 153 | "chunker = StatisticalChunker(\n", 154 | " encoder=encoder,\n", 155 | " min_split_tokens=100,\n", 156 | " max_split_tokens=500,\n", 157 | ")" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 7, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "chunks_0 = chunker(docs=[dataset[\"content\"][0]])\n" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 8, 172 | "metadata": {}, 173 | "outputs": [ 174 | { 175 | "data": { 176 | "text/html": [ 177 | "
╭──────────────────────────────────────────────────── Chunk 0 ────────────────────────────────────────────────────╮\n",
178 |        " 4 2 0 2 n a J 8 ] G L . s c [ 1 v 8 8 0 4 0 . 1 0 4 2 : v i X r a # Mixtral of Experts Albert Q. Jiang,         \n",
179 |        " Alexandre Sablayrolles, Antoine Roux, Arthur Mensch, Blanche Savary, Chris Bamford, Devendra Singh Chaplot,     \n",
180 |        " Diego de las Casas, Emma Bou Hanna, Florian Bressand, Gianna Lengyel, Guillaume Bour, Guillaume Lample, Lélio  \n",
181 |        " Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Sandeep Subramanian, Sophia Yang, Szymon      \n",
182 |        " Antoniak, Teven Le Scao, Théophile Gervet, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed    \n",
183 |        " Abstract We introduce Mixtral 8x7B, a Sparse Mixture of Experts (SMoE) language model. Mixtral has the same     \n",
184 |        " architecture as Mistral 7B, with the difference that each layer is composed of 8 feedforward blocks (i.e.       \n",
185 |        " experts). For every token, at each layer, a router network selects two experts to process the current state and \n",
186 |        " combine their outputs. Even though each token only sees two experts, the selected experts can be different at   \n",
187 |        " each timestep. As a result, each token has access to 47B parameters, but only uses 13B active parameters during \n",
188 |        " inference. Mixtral was trained with a context size of 32k tokens and it outperforms or matches Llama 2 70B and  \n",
189 |        " GPT-3.5 across all evaluated benchmarks. In particular, Mixtral vastly outperforms Llama 2 70B on mathematics,  \n",
190 |        " code generation, and multilingual benchmarks. We also provide a model fine- tuned to follow instructions,       \n",
191 |        " Mixtral 8x7B â Instruct, that surpasses GPT-3.5 Turbo, Claude-2.1, Gemini Pro, and Llama 2 70B â                \n",
192 |        "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
193 |        "
\n" 194 | ], 195 | "text/plain": [ 196 | "\u001b[1m╭─\u001b[0m\u001b[1m───────────────────────────────────────────────────\u001b[0m\u001b[1m Chunk 0 \u001b[0m\u001b[1m───────────────────────────────────────────────────\u001b[0m\u001b[1m─╮\u001b[0m\n", 197 | "\u001b[1m│\u001b[0m 4 2 0 2 n a J 8 ] G L . s c [ 1 v 8 8 0 4 0 . 1 0 4 2 : v i X r a # Mixtral of Experts Albert Q. Jiang, \u001b[1m│\u001b[0m\n", 198 | "\u001b[1m│\u001b[0m Alexandre Sablayrolles, Antoine Roux, Arthur Mensch, Blanche Savary, Chris Bamford, Devendra Singh Chaplot, \u001b[1m│\u001b[0m\n", 199 | "\u001b[1m│\u001b[0m Diego de las Casas, Emma Bou Hanna, Florian Bressand, Gianna Lengyel, Guillaume Bour, Guillaume Lample, Lélio \u001b[1m│\u001b[0m\n", 200 | "\u001b[1m│\u001b[0m Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Sandeep Subramanian, Sophia Yang, Szymon \u001b[1m│\u001b[0m\n", 201 | "\u001b[1m│\u001b[0m Antoniak, Teven Le Scao, Théophile Gervet, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed \u001b[1m│\u001b[0m\n", 202 | "\u001b[1m│\u001b[0m Abstract We introduce Mixtral 8x7B, a Sparse Mixture of Experts (SMoE) language model. Mixtral has the same \u001b[1m│\u001b[0m\n", 203 | "\u001b[1m│\u001b[0m architecture as Mistral 7B, with the difference that each layer is composed of 8 feedforward blocks (i.e. \u001b[1m│\u001b[0m\n", 204 | "\u001b[1m│\u001b[0m experts). For every token, at each layer, a router network selects two experts to process the current state and \u001b[1m│\u001b[0m\n", 205 | "\u001b[1m│\u001b[0m combine their outputs. Even though each token only sees two experts, the selected experts can be different at \u001b[1m│\u001b[0m\n", 206 | "\u001b[1m│\u001b[0m each timestep. As a result, each token has access to 47B parameters, but only uses 13B active parameters during \u001b[1m│\u001b[0m\n", 207 | "\u001b[1m│\u001b[0m inference. Mixtral was trained with a context size of 32k tokens and it outperforms or matches Llama 2 70B and \u001b[1m│\u001b[0m\n", 208 | "\u001b[1m│\u001b[0m GPT-3.5 across all evaluated benchmarks. In particular, Mixtral vastly outperforms Llama 2 70B on mathematics, \u001b[1m│\u001b[0m\n", 209 | "\u001b[1m│\u001b[0m code generation, and multilingual benchmarks. We also provide a model fine- tuned to follow instructions, \u001b[1m│\u001b[0m\n", 210 | "\u001b[1m│\u001b[0m Mixtral 8x7B â Instruct, that surpasses GPT-3.5 Turbo, Claude-2.1, Gemini Pro, and Llama 2 70B â \u001b[1m│\u001b[0m\n", 211 | "\u001b[1m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" 212 | ] 213 | }, 214 | "metadata": {}, 215 | "output_type": "display_data" 216 | } 217 | ], 218 | "source": [ 219 | "from rich.text import Text\n", 220 | "from rich.panel import Panel\n", 221 | "\n", 222 | "chunk_0_0 = ' '.join(chunks_0[0][0].splits)\n", 223 | "\n", 224 | "content = Text(chunk_0_0)\n", 225 | "console.print(Panel(content, title=f\"Chunk 0\", expand=False, border_style=\"bold\"))" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": {}, 231 | "source": [ 232 | "## Generate the context sentence\n", 233 | "\n", 234 | "We will use Anthropic Claude for the generation of the context. It is one of the best summarization LLM, and it introduced the [Prompt Caching](https://www.anthropic.com/news/prompt-caching) that is great for the generation of the context for many chunks of the same document." 
235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 9, 240 | "metadata": {}, 241 | "outputs": [ 242 | { 243 | "data": { 244 | "text/plain": [ 245 | "True" 246 | ] 247 | }, 248 | "execution_count": 9, 249 | "metadata": {}, 250 | "output_type": "execute_result" 251 | } 252 | ], 253 | "source": [ 254 | "from dotenv import load_dotenv\n", 255 | "\n", 256 | "load_dotenv()" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 10, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "import anthropic\n", 266 | "\n", 267 | "client = anthropic.Anthropic()\n" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 11, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "DOCUMENT_CONTEXT_PROMPT = \"\"\"\n", 277 | "\n", 278 | "{doc_content}\n", 279 | "\n", 280 | "\"\"\"\n", 281 | "\n", 282 | "CHUNK_CONTEXT_PROMPT = \"\"\"\n", 283 | "Here is the chunk we want to situate within the whole document\n", 284 | "\n", 285 | "{chunk_content}\n", 286 | "\n", 287 | "\n", 288 | "Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk.\n", 289 | "Answer only with the succinct context and nothing else.\n", 290 | "\"\"\"\n", 291 | "\n", 292 | "def situate_context(doc: str, chunk: str) -> str:\n", 293 | " response = client.beta.prompt_caching.messages.create(\n", 294 | " model=\"claude-3-haiku-20240307\",\n", 295 | " max_tokens=1024,\n", 296 | " temperature=0.0,\n", 297 | " messages=[\n", 298 | " {\n", 299 | " \"role\": \"user\", \n", 300 | " \"content\": [\n", 301 | " {\n", 302 | " \"type\": \"text\",\n", 303 | " \"text\": DOCUMENT_CONTEXT_PROMPT.format(doc_content=doc),\n", 304 | " \"cache_control\": {\"type\": \"ephemeral\"} #we will make use of prompt caching for the full documents\n", 305 | " },\n", 306 | " {\n", 307 | " \"type\": \"text\",\n", 308 | " \"text\": CHUNK_CONTEXT_PROMPT.format(chunk_content=chunk),\n", 309 | " }\n", 310 | " ]\n", 311 | " }\n", 312 | " ],\n", 313 | " extra_headers={\"anthropic-beta\": \"prompt-caching-2024-07-31\"}\n", 314 | " )\n", 315 | " return response" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 12, 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [ 324 | "chunk_context = situate_context(dataset[\"content\"][0], chunk_0_0)" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 13, 330 | "metadata": {}, 331 | "outputs": [ 332 | { 333 | "data": { 334 | "text/html": [ 335 | "
PromptCachingBetaMessage(\n",
336 |        "    id='msg_01Lg8AiYwn7wHDFAsWGsjhGx',\n",
337 |        "    content=[\n",
338 |        "        TextBlock(\n",
339 |        "            text='This chunk introduces Mixtral 8x7B, a sparse mixture of experts language model that outperforms \n",
340 |        "Llama 2 70B and GPT-3.5 on various benchmarks. It also describes the model architecture and the fine-tuned Mixtral \n",
341 |        "8x7B - Instruct model.',\n",
342 |        "            type='text'\n",
343 |        "        )\n",
344 |        "    ],\n",
345 |        "    model='claude-3-haiku-20240307',\n",
346 |        "    role='assistant',\n",
347 |        "    stop_reason='end_turn',\n",
348 |        "    stop_sequence=None,\n",
349 |        "    type='message',\n",
350 |        "    usage=PromptCachingBetaUsage(\n",
351 |        "        cache_creation_input_tokens=0,\n",
352 |        "        cache_read_input_tokens=12532,\n",
353 |        "        input_tokens=584,\n",
354 |        "        output_tokens=73\n",
355 |        "    )\n",
356 |        ")\n",
357 |        "
\n" 358 | ], 359 | "text/plain": [ 360 | "\u001b[93mPromptCachingBetaMessage\u001b[0m\u001b[1m(\u001b[0m\n", 361 | " \u001b[1;38;2;232;125;62mid\u001b[0m=\u001b[92m'msg_01Lg8AiYwn7wHDFAsWGsjhGx'\u001b[0m,\n", 362 | " \u001b[1;38;2;232;125;62mcontent\u001b[0m=\u001b[1m[\u001b[0m\n", 363 | " \u001b[93mTextBlock\u001b[0m\u001b[1m(\u001b[0m\n", 364 | " \u001b[1;38;2;232;125;62mtext\u001b[0m=\u001b[92m'This chunk introduces Mixtral 8x7B, a sparse mixture of experts language model that outperforms \u001b[0m\n", 365 | "\u001b[92mLlama 2 70B and GPT-3.5 on various benchmarks. It also describes the model architecture and the fine-tuned Mixtral \u001b[0m\n", 366 | "\u001b[92m8x7B - Instruct model.'\u001b[0m,\n", 367 | " \u001b[1;38;2;232;125;62mtype\u001b[0m=\u001b[92m'text'\u001b[0m\n", 368 | " \u001b[1m)\u001b[0m\n", 369 | " \u001b[1m]\u001b[0m,\n", 370 | " \u001b[1;38;2;232;125;62mmodel\u001b[0m=\u001b[92m'claude-3-haiku-20240307'\u001b[0m,\n", 371 | " \u001b[1;38;2;232;125;62mrole\u001b[0m=\u001b[92m'assistant'\u001b[0m,\n", 372 | " \u001b[1;38;2;232;125;62mstop_reason\u001b[0m=\u001b[92m'end_turn'\u001b[0m,\n", 373 | " \u001b[1;38;2;232;125;62mstop_sequence\u001b[0m=\u001b[2;37mNone\u001b[0m,\n", 374 | " \u001b[1;38;2;232;125;62mtype\u001b[0m=\u001b[92m'message'\u001b[0m,\n", 375 | " \u001b[1;38;2;232;125;62musage\u001b[0m=\u001b[93mPromptCachingBetaUsage\u001b[0m\u001b[1m(\u001b[0m\n", 376 | " \u001b[1;38;2;232;125;62mcache_creation_input_tokens\u001b[0m=\u001b[91m0\u001b[0m,\n", 377 | " \u001b[1;38;2;232;125;62mcache_read_input_tokens\u001b[0m=\u001b[91m12532\u001b[0m,\n", 378 | " \u001b[1;38;2;232;125;62minput_tokens\u001b[0m=\u001b[91m584\u001b[0m,\n", 379 | " \u001b[1;38;2;232;125;62moutput_tokens\u001b[0m=\u001b[91m73\u001b[0m\n", 380 | " \u001b[1m)\u001b[0m\n", 381 | "\u001b[1m)\u001b[0m\n" 382 | ] 383 | }, 384 | "metadata": {}, 385 | "output_type": "display_data" 386 | } 387 | ], 388 | "source": [ 389 | "console.print(chunk_context)" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 14, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "chunk_0_5 = ' '.join(chunks_0[0][5].splits)" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 15, 404 | "metadata": {}, 405 | "outputs": [], 406 | "source": [ 407 | "second_chunk_context = situate_context(dataset[\"content\"][0], chunk_0_5)" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 16, 413 | "metadata": {}, 414 | "outputs": [ 415 | { 416 | "data": { 417 | "text/html": [ 418 | "
PromptCachingBetaMessage(\n",
419 |        "    id='msg_01Ri5m7g6sH1usc5WYs64td7',\n",
420 |        "    content=[\n",
421 |        "        TextBlock(\n",
422 |        "            text='This chunk describes the architectural details of the Mixtral model, specifically the Sparse \n",
423 |        "Mixture of Experts layer that is a key component of the model.',\n",
424 |        "            type='text'\n",
425 |        "        )\n",
426 |        "    ],\n",
427 |        "    model='claude-3-haiku-20240307',\n",
428 |        "    role='assistant',\n",
429 |        "    stop_reason='end_turn',\n",
430 |        "    stop_sequence=None,\n",
431 |        "    type='message',\n",
432 |        "    usage=PromptCachingBetaUsage(\n",
433 |        "        cache_creation_input_tokens=0,\n",
434 |        "        cache_read_input_tokens=12532,\n",
435 |        "        input_tokens=221,\n",
436 |        "        output_tokens=35\n",
437 |        "    )\n",
438 |        ")\n",
439 |        "
\n" 440 | ], 441 | "text/plain": [ 442 | "\u001b[93mPromptCachingBetaMessage\u001b[0m\u001b[1m(\u001b[0m\n", 443 | " \u001b[1;38;2;232;125;62mid\u001b[0m=\u001b[92m'msg_01Ri5m7g6sH1usc5WYs64td7'\u001b[0m,\n", 444 | " \u001b[1;38;2;232;125;62mcontent\u001b[0m=\u001b[1m[\u001b[0m\n", 445 | " \u001b[93mTextBlock\u001b[0m\u001b[1m(\u001b[0m\n", 446 | " \u001b[1;38;2;232;125;62mtext\u001b[0m=\u001b[92m'This chunk describes the architectural details of the Mixtral model, specifically the Sparse \u001b[0m\n", 447 | "\u001b[92mMixture of Experts layer that is a key component of the model.'\u001b[0m,\n", 448 | " \u001b[1;38;2;232;125;62mtype\u001b[0m=\u001b[92m'text'\u001b[0m\n", 449 | " \u001b[1m)\u001b[0m\n", 450 | " \u001b[1m]\u001b[0m,\n", 451 | " \u001b[1;38;2;232;125;62mmodel\u001b[0m=\u001b[92m'claude-3-haiku-20240307'\u001b[0m,\n", 452 | " \u001b[1;38;2;232;125;62mrole\u001b[0m=\u001b[92m'assistant'\u001b[0m,\n", 453 | " \u001b[1;38;2;232;125;62mstop_reason\u001b[0m=\u001b[92m'end_turn'\u001b[0m,\n", 454 | " \u001b[1;38;2;232;125;62mstop_sequence\u001b[0m=\u001b[2;37mNone\u001b[0m,\n", 455 | " \u001b[1;38;2;232;125;62mtype\u001b[0m=\u001b[92m'message'\u001b[0m,\n", 456 | " \u001b[1;38;2;232;125;62musage\u001b[0m=\u001b[93mPromptCachingBetaUsage\u001b[0m\u001b[1m(\u001b[0m\n", 457 | " \u001b[1;38;2;232;125;62mcache_creation_input_tokens\u001b[0m=\u001b[91m0\u001b[0m,\n", 458 | " \u001b[1;38;2;232;125;62mcache_read_input_tokens\u001b[0m=\u001b[91m12532\u001b[0m,\n", 459 | " \u001b[1;38;2;232;125;62minput_tokens\u001b[0m=\u001b[91m221\u001b[0m,\n", 460 | " \u001b[1;38;2;232;125;62moutput_tokens\u001b[0m=\u001b[91m35\u001b[0m\n", 461 | " \u001b[1m)\u001b[0m\n", 462 | "\u001b[1m)\u001b[0m\n" 463 | ] 464 | }, 465 | "metadata": {}, 466 | "output_type": "display_data" 467 | } 468 | ], 469 | "source": [ 470 | "console.print(second_chunk_context)" 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": {}, 476 | "source": [ 477 | "## Enrich the chunk embedding vectors with the context" 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": {}, 483 | "source": [ 484 | "### Concatenate the generated context to the chunk text\n", 485 | "\n", 486 | "We will iterate over all the chunks. This can take some time based on the number of chunks." 
487 | ] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": 17, 492 | "metadata": {}, 493 | "outputs": [ 494 | { 495 | "name": "stderr", 496 | "output_type": "stream", 497 | "text": [ 498 | "Processing chunks: 100%|██████████| 46/46 [26:50<00:00, 35.00s/it] \n" 499 | ] 500 | } 501 | ], 502 | "source": [ 503 | "arxiv_id = dataset[0][\"id\"]\n", 504 | "refs = list(dataset[0][\"references\"].values())\n", 505 | "doc_text = dataset[0][\"content\"]\n", 506 | "title = dataset[0][\"title\"]\n", 507 | "\n", 508 | "from tqdm import tqdm\n", 509 | "\n", 510 | "corpus_json = []\n", 511 | "for i, chunk in tqdm(enumerate(chunks_0[0]), total=len(chunks_0[0]), desc=\"Processing chunks\"):\n", 512 | " chunk_text = ' '.join(chunk.splits)\n", 513 | " contextualized_text = situate_context(doc_text, chunk_text).content[0].text\n", 514 | " corpus_json.append({\n", 515 | " \"id\": i,\n", 516 | " \"text\": f\"{chunk_text}\\n\\n{contextualized_text}\",\n", 517 | " \"metadata\" : {\n", 518 | " \"title\": title,\n", 519 | " \"arxiv_id\": arxiv_id,\n", 520 | " \"references\": refs\n", 521 | " }\n", 522 | " })" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": 18, 528 | "metadata": {}, 529 | "outputs": [ 530 | { 531 | "data": { 532 | "text/html": [ 533 | "
[\n",
534 |        "    {\n",
535 |        "        'id': 0,\n",
536 |        "        'text': '4 2 0 2 n a J 8 ] G L . s c [ 1 v 8 8 0 4 0 . 1 0 4 2 : v i X r a # Mixtral of Experts Albert Q. \n",
537 |        "Jiang, Alexandre Sablayrolles, Antoine Roux, Arthur Mensch, Blanche Savary, Chris Bamford, Devendra Singh Chaplot, \n",
538 |        "Diego de las Casas, Emma Bou Hanna, Florian Bressand, Gianna Lengyel, Guillaume Bour, Guillaume Lample, Lélio \n",
539 |        "Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Sandeep Subramanian, Sophia Yang, Szymon \n",
540 |        "Antoniak, Teven Le Scao, Théophile Gervet, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed \n",
541 |        "Abstract We introduce Mixtral 8x7B, a Sparse Mixture of Experts (SMoE) language model. Mixtral has the same \n",
542 |        "architecture as Mistral 7B, with the difference that each layer is composed of 8 feedforward blocks (i.e. experts).\n",
543 |        "For every token, at each layer, a router network selects two experts to process the current state and combine their\n",
544 |        "outputs. Even though each token only sees two experts, the selected experts can be different at each timestep. As a\n",
545 |        "result, each token has access to 47B parameters, but only uses 13B active parameters during inference. Mixtral was \n",
546 |        "trained with a context size of 32k tokens and it outperforms or matches Llama 2 70B and GPT-3.5 across all \n",
547 |        "evaluated benchmarks. In particular, Mixtral vastly outperforms Llama 2 70B on mathematics, code generation, and \n",
548 |        "multilingual benchmarks. We also provide a model fine- tuned to follow instructions, Mixtral 8x7B â Instruct, that \n",
549 |        "surpasses GPT-3.5 Turbo, Claude-2.1, Gemini Pro, and Llama 2 70B â\\n\\nThis chunk introduces Mixtral 8x7B, a sparse \n",
550 |        "mixture of experts language model that outperforms Llama 2 70B and GPT-3.5 on various benchmarks. It also describes\n",
551 |        "the model architecture and the fine-tuned Mixtral 8x7B - Instruct model.',\n",
552 |        "        'metadata': {'title': 'Mixtral of Experts', 'arxiv_id': '2401.04088', 'references': ['1905.07830']}\n",
553 |        "    },\n",
554 |        "    {\n",
555 |        "        'id': 1,\n",
556 |        "        'text': 'chat model on human bench- marks. Both the base and instruct models are released under the Apache \n",
557 |        "2.0 license. Code: https://github.com/mistralai/mistral-src Webpage: https://mistral.ai/news/mixtral-of-experts/ # \n",
558 |        "Introduction In this paper, we present Mixtral 8x7B, a sparse mixture of experts model (SMoE) with open weights, \n",
559 |        "licensed under Apache 2.0. Mixtral outperforms Llama 2 70B and GPT-3.5 on most benchmarks. As it only uses a subset\n",
560 |        "of its parameters for every token, Mixtral allows faster inference speed at low batch-sizes, and higher throughput \n",
561 |        "at large batch-sizes. Mixtral is a sparse mixture-of-experts network. It is a decoder-only model where the \n",
562 |        "feedforward block picks from a set of 8 distinct groups of parameters. At every layer, for every token, a router \n",
563 |        "network chooses two of these groups (the â\\n\\nThis chunk introduces Mixtral 8x7B, a sparse mixture of experts \n",
564 |        "language model that outperforms Llama 2 70B and GPT-3.5 on most benchmarks. It describes the key architectural \n",
565 |        "details of Mixtral, including its use of a sparse mixture-of-experts network, and mentions that the base and \n",
566 |        "instruct models are released under the Apache 2.0 license.',\n",
567 |        "        'metadata': {'title': 'Mixtral of Experts', 'arxiv_id': '2401.04088', 'references': ['1905.07830']}\n",
568 |        "    }\n",
569 |        "]\n",
570 |        "
\n" 571 | ], 572 | "text/plain": [ 573 | "\u001b[1m[\u001b[0m\n", 574 | " \u001b[1m{\u001b[0m\n", 575 | " \u001b[92m'id'\u001b[0m: \u001b[91m0\u001b[0m,\n", 576 | " \u001b[92m'text'\u001b[0m: \u001b[92m'4 2 0 2 n a J 8 \u001b[0m\u001b[1;92m]\u001b[0m\u001b[92m G L . s c \u001b[0m\u001b[1;92m[\u001b[0m\u001b[92m 1 v 8 8 0 4 0 . 1 0 4 2 : v i X r a # Mixtral of Experts Albert Q. \u001b[0m\n", 577 | "\u001b[92mJiang, Alexandre Sablayrolles, Antoine Roux, Arthur Mensch, Blanche Savary, Chris Bamford, Devendra Singh Chaplot, \u001b[0m\n", 578 | "\u001b[92mDiego de las Casas, Emma Bou Hanna, Florian Bressand, Gianna Lengyel, Guillaume Bour, Guillaume Lample, Lélio \u001b[0m\n", 579 | "\u001b[92mRenard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Sandeep Subramanian, Sophia Yang, Szymon \u001b[0m\n", 580 | "\u001b[92mAntoniak, Teven Le Scao, Théophile Gervet, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed \u001b[0m\n", 581 | "\u001b[92mAbstract We introduce Mixtral 8x7B, a Sparse Mixture of Experts \u001b[0m\u001b[1;92m(\u001b[0m\u001b[92mSMoE\u001b[0m\u001b[1;92m)\u001b[0m\u001b[92m language model. Mixtral has the same \u001b[0m\n", 582 | "\u001b[92marchitecture as Mistral 7B, with the difference that each layer is composed of 8 feedforward blocks \u001b[0m\u001b[1;92m(\u001b[0m\u001b[92mi.e. experts\u001b[0m\u001b[1;92m)\u001b[0m\u001b[92m.\u001b[0m\n", 583 | "\u001b[92mFor every token, at each layer, a router network selects two experts to process the current state and combine their\u001b[0m\n", 584 | "\u001b[92moutputs. Even though each token only sees two experts, the selected experts can be different at each timestep. As a\u001b[0m\n", 585 | "\u001b[92mresult, each token has access to 47B parameters, but only uses 13B active parameters during inference. Mixtral was \u001b[0m\n", 586 | "\u001b[92mtrained with a context size of 32k tokens and it outperforms or matches Llama 2 70B and GPT-3.5 across all \u001b[0m\n", 587 | "\u001b[92mevaluated benchmarks. In particular, Mixtral vastly outperforms Llama 2 70B on mathematics, code generation, and \u001b[0m\n", 588 | "\u001b[92mmultilingual benchmarks. We also provide a model fine- tuned to follow instructions, Mixtral 8x7B â Instruct, that \u001b[0m\n", 589 | "\u001b[92msurpasses GPT-3.5 Turbo, Claude-2.1, Gemini Pro, and Llama 2 70B â\\n\\nThis chunk introduces Mixtral 8x7B, a sparse \u001b[0m\n", 590 | "\u001b[92mmixture of experts language model that outperforms Llama 2 70B and GPT-3.5 on various benchmarks. It also describes\u001b[0m\n", 591 | "\u001b[92mthe model architecture and the fine-tuned Mixtral 8x7B - Instruct model.'\u001b[0m,\n", 592 | " \u001b[92m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\u001b[92m'title'\u001b[0m: \u001b[92m'Mixtral of Experts'\u001b[0m, \u001b[92m'arxiv_id'\u001b[0m: \u001b[92m'2401.04088'\u001b[0m, \u001b[92m'references'\u001b[0m: \u001b[1m[\u001b[0m\u001b[92m'1905.07830'\u001b[0m\u001b[1m]\u001b[0m\u001b[1m}\u001b[0m\n", 593 | " \u001b[1m}\u001b[0m,\n", 594 | " \u001b[1m{\u001b[0m\n", 595 | " \u001b[92m'id'\u001b[0m: \u001b[91m1\u001b[0m,\n", 596 | " \u001b[92m'text'\u001b[0m: \u001b[92m'chat model on human bench- marks. Both the base and instruct models are released under the Apache \u001b[0m\n", 597 | "\u001b[92m2.0 license. 
Code: https://github.com/mistralai/mistral-src Webpage: https://mistral.ai/news/mixtral-of-experts/ # \u001b[0m\n", 598 | "\u001b[92mIntroduction In this paper, we present Mixtral 8x7B, a sparse mixture of experts model \u001b[0m\u001b[1;92m(\u001b[0m\u001b[92mSMoE\u001b[0m\u001b[1;92m)\u001b[0m\u001b[92m with open weights, \u001b[0m\n", 599 | "\u001b[92mlicensed under Apache 2.0. Mixtral outperforms Llama 2 70B and GPT-3.5 on most benchmarks. As it only uses a subset\u001b[0m\n", 600 | "\u001b[92mof its parameters for every token, Mixtral allows faster inference speed at low batch-sizes, and higher throughput \u001b[0m\n", 601 | "\u001b[92mat large batch-sizes. Mixtral is a sparse mixture-of-experts network. It is a decoder-only model where the \u001b[0m\n", 602 | "\u001b[92mfeedforward block picks from a set of 8 distinct groups of parameters. At every layer, for every token, a router \u001b[0m\n", 603 | "\u001b[92mnetwork chooses two of these groups \u001b[0m\u001b[1;92m(\u001b[0m\u001b[92mthe â\\n\\nThis chunk introduces Mixtral 8x7B, a sparse mixture of experts \u001b[0m\n", 604 | "\u001b[92mlanguage model that outperforms Llama 2 70B and GPT-3.5 on most benchmarks. It describes the key architectural \u001b[0m\n", 605 | "\u001b[92mdetails of Mixtral, including its use of a sparse mixture-of-experts network, and mentions that the base and \u001b[0m\n", 606 | "\u001b[92minstruct models are released under the Apache 2.0 license.'\u001b[0m,\n", 607 | " \u001b[92m'metadata'\u001b[0m: \u001b[1m{\u001b[0m\u001b[92m'title'\u001b[0m: \u001b[92m'Mixtral of Experts'\u001b[0m, \u001b[92m'arxiv_id'\u001b[0m: \u001b[92m'2401.04088'\u001b[0m, \u001b[92m'references'\u001b[0m: \u001b[1m[\u001b[0m\u001b[92m'1905.07830'\u001b[0m\u001b[1m]\u001b[0m\u001b[1m}\u001b[0m\n", 608 | " \u001b[1m}\u001b[0m\n", 609 | "\u001b[1m]\u001b[0m\n" 610 | ] 611 | }, 612 | "metadata": {}, 613 | "output_type": "display_data" 614 | } 615 | ], 616 | "source": [ 617 | "console.print(corpus_json[:2])" 618 | ] 619 | }, 620 | { 621 | "cell_type": "markdown", 622 | "metadata": {}, 623 | "source": [ 624 | "### Saving the corpus_json in a file\n", 625 | "\n", 626 | "We will want to use it in the next notebook." 
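
The next cell writes this corpus to `data/corpus.json`. As a rough sketch of how a follow-up notebook might reload the file and turn the enriched texts into embedding vectors, the snippet below assumes the same `OpenAIEncoder` used earlier can be called on a list of texts and returns one vector per text (an assumption to verify against the installed `semantic_router` version):

```python
import json

from semantic_router.encoders import OpenAIEncoder

# Reload the enriched corpus that this notebook writes to data/corpus.json.
with open("data/corpus.json") as f:
    corpus = json.load(f)

# Assumption: the encoder is callable on a list of texts and returns one
# embedding per text; adjust if your semantic_router version differs.
encoder = OpenAIEncoder(name="text-embedding-3-small")
texts = [item["text"] for item in corpus]  # chunk text with the appended context sentence
embeddings = encoder(texts)

print(f"{len(embeddings)} embeddings, dimension {len(embeddings[0])}")
```
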
627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": 19, 632 | "metadata": {}, 633 | "outputs": [], 634 | "source": [ 635 | "import json\n", 636 | "\n", 637 | "with open('data/corpus.json', 'w') as f:\n", 638 | " json.dump(corpus_json, f)\n", 639 | "\n" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": null, 645 | "metadata": {}, 646 | "outputs": [], 647 | "source": [] 648 | } 649 | ], 650 | "metadata": { 651 | "kernelspec": { 652 | "display_name": ".venv", 653 | "language": "python", 654 | "name": "python3" 655 | }, 656 | "language_info": { 657 | "codemirror_mode": { 658 | "name": "ipython", 659 | "version": 3 660 | }, 661 | "file_extension": ".py", 662 | "mimetype": "text/x-python", 663 | "name": "python", 664 | "nbconvert_exporter": "python", 665 | "pygments_lexer": "ipython3", 666 | "version": "3.11.0" 667 | } 668 | }, 669 | "nbformat": 4, 670 | "nbformat_minor": 2 671 | } 672 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Guy Ernest 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Mastering Large Language Models (LLM) with Retrieval Augmented Generation (RAG) 2 | 3 | This repository includes a set of assets that are used in the above course, which dives 4 | deeper into the various problems and solutions when building a RAG system in enterprise environments. 5 | 6 | The course is available on [edX](https://www.edx.org/learn/computer-science/pragmatic-ai-labs-advanced-rag). 7 | 8 | ## Simple RAG Flow 9 | 10 | ![Naive RAG](images/Naiive_RAG.png) 11 | 12 | ## Jupyter Notebooks 13 | 14 | 1. [Simple RAG](01_simple_rag.ipynb): This notebook introduces the fundamental concepts and implementation of Retrieval Augmented Generation (RAG). [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/guyernest/advanced-rag/blob/main/01_simple_rag.ipynb) [![Open In Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/guyernest/advanced-rag/blob/main/01_simple_rag.ipynb) 15 | 2. 
[Embedding Model](02_embedding_model.ipynb): This notebook examines the role of embedding models in RAG systems, including their applications and limitations. 16 | 3. [Semantic Chunking](03_semantic_chunking.ipynb): This notebook investigates the process of semantic chunking in RAG systems, focusing on its significance in information retrieval. 17 | 4. [Contextual Retrieval](04_contextual_retrieval.ipynb): This notebook delves into the strategies for contextual retrieval in RAG systems, emphasizing the effective handling of numerical data and tables. 18 | 5. [Reverse HyDE](05_reverse_hyde.ipynb): This notebook explores the Reverse HyDE technique in RAG systems, highlighting its importance in addressing contextual relevance and query ambiguity. 19 | 6. [Hybrid Search](06_hybrid_search.ipynb): This notebook discusses the integration of hybrid search capabilities in RAG systems, encompassing both temporal relevance and multi-lingual support. 20 | 7. [Reranking](07_reranking.ipynb): This notebook discusses the importance of reranking in RAG systems, focusing on techniques to refine the initial retrieval results for more accurate and relevant outputs. 21 | 8. [Multi Modal Retrieval](08_multimodal_pdf.ipynb): This notebook explores the possibilities of retrieval from images, not limited to text. 22 | 23 | ## Common Problems in RAG Systems and Their Solutions 24 | 25 | The problematic part of a RAG application is usually the retrieval step. The system might retrieve the wrong documents, or only some of them, and produce wrong replies. This is the classic precision-recall tradeoff in machine learning. 26 | 27 | ![Recall Precision Tradeoff](images/Recall_Precision_in_RAG_Diagram.png) 28 | 29 | The following list describes complexity factors in real-life documents that can decrease overall retrieval accuracy, along with the techniques that can improve retrieval performance. 30 | 31 | 1. **Long Documents** 32 | - Problem: Difficulty in processing and retrieving information from lengthy documents. 33 | - Solutions: 34 | - Chunking options 35 | - Sentence-based chunking 36 | - Paragraph-based chunking 37 | - Fixed-size chunking with overlap 38 | - Statistical chunking (see: [03_semantic_chunking.ipynb](03_semantic_chunking.ipynb)) 39 | - Hierarchical retrieval (e.g., parent-child chunks) 40 | - Contextual retrieval (see: [04_contextual_retrieval.ipynb](04_contextual_retrieval.ipynb)) 41 | 42 | 2. **Mismatch Between Questions and Document Formats** 43 | - Problem: User queries may not align with the way information is structured in documents. 44 | - Solutions: 45 | - Hypothetical Document Embeddings (HyDE) 46 | - Reverse HyDE (see: [05_reverse_hyde.ipynb](05_reverse_hyde.ipynb)) 47 | 48 | 3. **Domain-Specific Jargon** 49 | - Problem: General LLMs may struggle with specialized vocabulary. 50 | - Solutions: 51 | - Incorporating domain-specific embeddings (see: [02_embedding_model.ipynb](02_embedding_model.ipynb)) 52 | - Hybrid Search (see: [06_hybrid_search.ipynb](06_hybrid_search.ipynb)) 53 | - Fine-tuning on domain-specific corpora 54 | 55 | 4. **Complex Documents** 56 | - Problem: Handling documents with complex structures, such as scanned documents, which can be challenging for traditional text-based retrieval methods. 57 | - Solutions: 58 | - Multi-modal retrieval (see: [08_multimodal_pdf.ipynb](08_multimodal_pdf.ipynb)) 59 | - Utilizing computer vision techniques to extract information from images and other non-textual elements within documents. 
60 | - Integrating this information with text-based retrieval methods for a more comprehensive search capability. 61 | 62 | ## Advanced RAG System 63 | 64 | ![Advanced RAG](images/Advanced_RAG.png) 65 | 66 | ## Set Up Local Jupyter 67 | 68 | To set up the project, you have two options: using `uv` or traditional `pip`. 69 | 70 | **Using [`uv`](https://github.com/astral-sh/uv)** 71 | 72 | ```shell 73 | pip install uv # Install `uv` 74 | uv venv --python cpython-3.12.3-macos-aarch64-none # Create a virtual environment for macOS with Apple Silicon 75 | source .venv/bin/activate # Activate the virtual environment. 76 | uv pip compile requirements.in --output-file requirements.txt # Compile the project dependencies 77 | uv pip install -r requirements.txt # Install the compiled dependencies 78 | ``` 79 | 80 | ![Advanced RAG Setup](images/advanced-rag-setup.gif) 81 | 82 | 83 | If the virtual environment doesn't find `pip` or other modules: 84 | ```shell 85 | curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py # Get the pip package 86 | python3 get-pip.py # Install the pip package 87 | pip install ipykernel # Install ipykernel to add support for Jupyter notebooks to VS Code 88 | ``` 89 | 90 | To execute the notebooks in a [Jupyter Lab](https://jupyter.org/install) environment, you need to run the following commands: 91 | ```shell 92 | python3 -m pip install jupyterlab # Install Jupyter Lab and ipykernel to manage kernels for Jupyter 93 | python3 -m ipykernel install --user --name=.venv --display-name="Python (.venv)" # Create kernel from the virtual environment 94 | jupyter lab 95 | ``` 96 | Then select the `Python (.venv)` kernel from the kernel list. 97 | 98 | **Using traditional `pip`** 99 | 100 | 1. Create a virtual environment using your preferred method (e.g., `python -m venv myenv`). 101 | 2. Activate the virtual environment. 102 | 3. Install the project dependencies using `pip install -r requirements.txt`. 103 | 104 | Note: The `uv` method is recommended for its speed, ease of use, and management of project dependencies. However, the traditional `pip` method is also supported for those familiar with it. 105 | 106 | ## Setup in Google Colab 107 | 108 | Follow the link to the first hands-on lab: [![Open In Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/guyernest/advanced-rag/blob/main/01_simple_rag.ipynb) 109 | 110 | Before the first cell, add the following commands: 111 | 112 | ``` 113 | !git clone https://github.com/guyernest/advanced-rag.git 114 | %cd advanced-rag 115 | !pip install -q -r requirements.txt 116 | ``` 117 | 118 | You might need to restart the Colab runtime after the dependencies are installed. Remember to change back to the course folder: 119 | 120 | ``` 121 | %cd advanced-rag 122 | ``` 123 | 124 | ## Setup in SageMaker Studio Lab 125 | 126 | Follow the link to the first hands-on lab: [![Open In SageMaker Studio Lab](https://studiolab.sagemaker.aws/studiolab.svg)](https://studiolab.sagemaker.aws/import/github/guyernest/advanced-rag/blob/main/01_simple_rag.ipynb) 127 | 128 | Clone the GitHub repository when prompted by the service. 129 | 130 | Before the first cell, add the following commands: 131 | 132 | ``` 133 | %cd advanced-rag 134 | !pip install -q -r requirements.txt 135 | ``` 136 | -------------------------------------------------------------------------------- /data/corpus.json: -------------------------------------------------------------------------------- 1 | [{"id": 0, "text": "4 2 0 2 n a J 8 ] G L . 
s c [ 1 v 8 8 0 4 0 . 1 0 4 2 : v i X r a # Mixtral of Experts Albert Q. Jiang, Alexandre Sablayrolles, Antoine Roux, Arthur Mensch, Blanche Savary, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Emma Bou Hanna, Florian Bressand, Gianna Lengyel, Guillaume Bour, Guillaume Lample, L\u00c3\u00a9lio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Sandeep Subramanian, Sophia Yang, Szymon Antoniak, Teven Le Scao, Th\u00c3\u00a9ophile Gervet, Thibaut Lavril, Thomas Wang, Timoth\u00c3\u00a9e Lacroix, William El Sayed Abstract We introduce Mixtral 8x7B, a Sparse Mixture of Experts (SMoE) language model. Mixtral has the same architecture as Mistral 7B, with the difference that each layer is composed of 8 feedforward blocks (i.e. experts). For every token, at each layer, a router network selects two experts to process the current state and combine their outputs. Even though each token only sees two experts, the selected experts can be different at each timestep. As a result, each token has access to 47B parameters, but only uses 13B active parameters during inference. Mixtral was trained with a context size of 32k tokens and it outperforms or matches Llama 2 70B and GPT-3.5 across all evaluated benchmarks. In particular, Mixtral vastly outperforms Llama 2 70B on mathematics, code generation, and multilingual benchmarks. We also provide a model fine- tuned to follow instructions, Mixtral 8x7B \u00e2 Instruct, that surpasses GPT-3.5 Turbo, Claude-2.1, Gemini Pro, and Llama 2 70B \u00e2\n\nThis chunk introduces Mixtral 8x7B, a sparse mixture of experts language model that outperforms Llama 2 70B and GPT-3.5 on various benchmarks. It also describes the model architecture and the fine-tuned Mixtral 8x7B - Instruct model.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 1, "text": "chat model on human bench- marks. Both the base and instruct models are released under the Apache 2.0 license. Code: https://github.com/mistralai/mistral-src Webpage: https://mistral.ai/news/mixtral-of-experts/ # Introduction In this paper, we present Mixtral 8x7B, a sparse mixture of experts model (SMoE) with open weights, licensed under Apache 2.0. Mixtral outperforms Llama 2 70B and GPT-3.5 on most benchmarks. As it only uses a subset of its parameters for every token, Mixtral allows faster inference speed at low batch-sizes, and higher throughput at large batch-sizes. Mixtral is a sparse mixture-of-experts network. It is a decoder-only model where the feedforward block picks from a set of 8 distinct groups of parameters. At every layer, for every token, a router network chooses two of these groups (the \u00e2\n\nThis chunk introduces Mixtral 8x7B, a sparse mixture of experts language model that outperforms Llama 2 70B and GPT-3.5 on most benchmarks. It describes the key architectural details of Mixtral, including its use of a sparse mixture-of-experts network, and mentions that the base and instruct models are released under the Apache 2.0 license.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 2, "text": "experts\u00e2 ) to process the token and combine their output additively. This technique increases the number of parameters of a model while controlling cost and latency, as the model only uses a fraction of the total set of parameters per token. Mixtral is pretrained with multilingual data using a context size of 32k tokens. 
It either matches or exceeds the performance of Llama 2 70B and GPT-3.5, over several benchmarks. In particular, Mixture of Experts Layer i gating inputs af outputs router expert\n\nThis chunk describes the key architectural details of the Mixtral model, a sparse mixture-of-experts language model that outperforms larger models like Llama 2 70B and GPT-3.5 on various benchmarks.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 3, "text": "Figure 1: Mixture of Experts Layer. Each input vector is assigned to 2 of the 8 experts by a router. The layer\u00e2 s output is the weighted sum of the outputs of the two selected experts. In Mixtral, an expert is a standard feedforward block as in a vanilla transformer architecture. Mixtral demonstrates superior capabilities in mathematics, code generation, and tasks that require multilingual understanding, significantly outperforming Llama 2 70B in these domains. Experiments show that Mixtral is able to successfully retrieve information from its context window of 32k tokens, regardless of the sequence length and the location of the information in the sequence. We also present Mixtral 8x7B \u00e2 Instruct, a chat model fine-tuned to follow instructions using supervised fine-tuning and Direct Preference Optimization [25]. Its performance notably surpasses that of GPT-3.5 Turbo, Claude-2.1, Gemini Pro, and Llama 2 70B \u00e2 chat model on human evaluation benchmarks. Mixtral \u00e2 Instruct also demonstrates reduced biases, and a more balanced sentiment profile in benchmarks such as BBQ, and BOLD. We release both Mixtral 8x7B and Mixtral 8x7B \u00e2\n\nThis chunk describes the Mixture of Experts layer architecture used in the Mixtral model, as well as the superior performance of Mixtral compared to other models on various benchmarks, including mathematics, code generation, and multilingual tasks. It also introduces the Mixtral 8x7B - Instruct model, which is fine-tuned to follow instructions and outperforms other chat models on human evaluation benchmarks.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 4, "text": "Instruct under the Apache 2.0 license1, free for academic and commercial usage, ensuring broad accessibility and potential for diverse applications. To enable the community to run Mixtral with a fully open-source stack, we submitted changes to the vLLM project, which integrates Megablocks CUDA kernels for efficient inference. Skypilot also allows the deployment of vLLM endpoints on any instance in the cloud. # 2 Architectural details Mixtral is based on a transformer architecture [31] and uses the same modifications as described in [18], with the notable exceptions that Mix- tral supports a fully dense context length of 32k tokens, and the feed- forward blocks are replaced by Mixture-of-Expert layers (Section 2.1). The model architecture parameters are summarized in Table 1.\n\nThis chunk describes the architectural details of the Mixtral language model, including its use of a transformer architecture with a 32k token context length and mixture-of-expert layers. 
It also mentions the model's open-source licensing and deployment options.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 5, "text": "Parameter Value dim n_layers head_dim hidden_dim n_heads n_kv_heads context_len vocab_size num_experts top_k_experts # 2.1 Sparse Mixture of Experts We present a brief overview of the Mixture of Experts layer (Figure 1). For a more in-depth overview, see [12]. The output of the MoE module for a given input x is determined by the weighted sum of the outputs of the expert networks, where the weights are given by the gating network\u00e2 s output. i.e. given n expert networks {E0, Ei, ..., En\u00e2 1}, the output of the expert layer is given by:\n\nThis chunk describes the architectural details of the Mixtral model, specifically the Sparse Mixture of Experts layer that is a key component of the model.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 6, "text": "Table 1: Model architecture. # j n\u00e2 G(x)i \u00c2\u00b7 Ei(x). i=0 Here, G(x)i denotes the n-dimensional output of the gating network for the i-th expert, and Ei(x) is the output of the i-th expert network. If the gating vector is sparse, we can avoid computing the outputs of experts whose gates are zero. There are multiple alternative ways of implementing G(x) [6, 15, 35], but a simple and performant one is implemented by taking the softmax over the Top-K logits of a linear layer [28].\n\nThe chunk describes the architectural details of the Mixtral model, specifically the Sparse Mixture of Experts (SMoE) layer that is used in the model.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 7, "text": "We use G(x) := Softmax(TopK(x \u00c2\u00b7 Wg)), where (TopK(\u00e2 ))i := \u00e2 i if \u00e2 i is among the top-K coordinates of logits \u00e2 \u00e2 Rn and (TopK(\u00e2 ))i := \u00e2 \u00e2 otherwise. The value of K \u00e2 the number of experts used per token \u00e2 is a hyper-parameter that modu- lates the amount of compute used to process each token. If one increases n while keeping K fixed, one # 1https://mistral.ai/news/mixtral-of-experts/\n\nThis chunk describes the gating mechanism used in the Mixture of Experts (MoE) layer of the Mixtral model. It explains how the router network selects the top-K experts to process each token, and how this allows the model to increase its parameter count while keeping the computational cost constant.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 8, "text": "2 4096 32 128 14336 32 8 32768 32000 8 2 can increase the model\u00e2 s parameter count while keeping its computational cost effectively constant. This motivates a distinction between the model\u00e2 s total parameter count (commonly referenced as the sparse parameter count), which grows with n, and the number of parameters used for processing an individual token (called the active parameter count), which grows with K up to n. MoE layers can be run efficiently on single GPUs with high performance specialized kernels. For example, Megablocks [13] casts the feed-forward network (FFN) operations of the MoE layer as large sparse matrix multiplications, significantly enhancing the execution speed and naturally handling cases where different experts get a variable number of tokens assigned to them. 
Moreover, the MoE layer can be distributed to multiple GPUs through standard Model Parallelism techniques, and through a particular kind of partitioning strategy called Expert Parallelism (EP) [28]. During the MoE layer\u00e2 s execution, tokens meant to be processed by a specific expert are routed to the corresponding GPU for processing, and the expert\u00e2 s output is returned to the original token location. Note that EP introduces challenges in load balancing, as it is essential to distribute the workload evenly across the GPUs to prevent overloading individual GPUs or hitting computational bottlenecks. In a Transformer model, the MoE layer is applied independently per token and replaces the feed-forward (FFN) sub-block of the transformer block. For Mixtral we use the same SwiGLU architecture as the expert function Ei(x) and set K = 2. This means each token is routed to two SwiGLU sub-blocks with different sets of weights. Taking this all together, the output y for an input token x is computed as: n-1 y= Ss Softmax(Top2(a - W,)); - SwiGLU;(a). i=0 This formulation is similar to the GShard architecture [21], with the exceptions that we replace all FFN sub-blocks by MoE layers while GShard replaces every other block, and that GShard uses a more elaborate gating strategy for the second expert assigned to each token.\n\nThis chunk describes the architectural details of the Mixtral model, specifically the Sparse Mixture of Experts (SMoE) layer that replaces the feedforward network in the Transformer architecture. It explains how the SMoE layer works, including the expert selection process, and how it compares to the GShard architecture.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 9, "text": "# 3 Results We compare Mixtral to Llama, and re-run all benchmarks with our own evaluation pipeline for fair comparison. We measure performance on a wide variety of tasks categorized as follow: \u00e2 \u00a2 Commonsense Reasoning (0-shot): Hellaswag [32], Winogrande [26], PIQA [3], SIQA [27], OpenbookQA [22], ARC-Easy, ARC-Challenge [8], CommonsenseQA [30] World Knowledge (5-shot): NaturalQuestions [20], TriviaQA [19] \u00e2 \u00a2 Reading Comprehension (0-shot): BoolQ [7], QuAC [5] \u00e2 \u00a2 Math: GSM8K [9] (8-shot) with maj@8 and MATH [17] (4-shot) with maj@4 \u00e2 \u00a2 Code: Humaneval [4] (0-shot) and MBPP [1] (3-shot) \u00e2 \u00a2 Popular aggregated results: MMLU [16] (5-shot), BBH [29] (3-shot), and AGI Eval [34] (3-5-shot, English multiple-choice questions only) 80 SE Mistral 78 = LLaMA27B = Sl LLaMA134B, jam Mistral 78 = LlaMA27B Ss LLAMA 1348, cee Mixtral 8x78 Sm LLaMA213B\u00c2\u00b0 mmm LLaMA2 70B je Mixtral 8x78 mm LlaMA2138 lm LLaMA2 708 70 50 60 50 20 40 10 BH Code MMU Knowledge Reasoning \u00e2 Comprehension AGI Eval Math \u00e2 Accuracy (%) Figure 2: Performance of Mixtral and different Llama models on a wide range of benchmarks. All models were re-evaluated on all metrics with our evaluation pipeline for accurate comparison. Mixtral outperforms or matches Llama 2 70B on all benchmarks. In particular, it is vastly superior in mathematics and code generation.\n\nThis chunk presents the results of evaluating the Mixtral 8x7B model on a wide range of benchmarks, including commonsense reasoning, world knowledge, reading comprehension, mathematics, and code generation tasks. 
It compares the performance of Mixtral to the Llama models, showing that Mixtral outperforms or matches Llama 2 70B on all benchmarks, particularly in mathematics and code generation.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 10, "text": "3 Active Params MMLU HellaS WinoG PIQA Arc-e Arc-c NQ TriQA HumanE MBPP Math GSM8K 7B 44.4% 77.1% 69.5% 77.9% 68.7% 43.2% 17.5% 56.6% 11.6% 26.1% 3.9% 16.0% 13B 55.6% 80.7% 72.9% 80.8% 75.2% 48.8% 16.7% 64.0% 18.9% 35.4% 6.0% 34.3% 33B 56.8% 83.7% 76.2% 82.2% 79.6% 54.4% 24.1% 68.5% 25.0% 40.9% 8.4% 44.1% 70B 69.9% 85.4% 80.4% 82.6% 79.9% 56.5% 25.4% 73.0% 29.3% 49.8% 13.8% 69.6% 7B 62.5% 81.0% 74.2% 82.2% 80.5% 54.9% 23.2% 62.5% 26.2% 50.2% 12.7% 50.0% 13B 70.6% 84.4% 77.2% 83.6% 83.1% 59.7% 30.6% 71.5% 40.2% 60.7% 28.4% 74.4% Table 2: Comparison of Mixtral with Llama. Mixtral outperforms or matches Llama 2 70B performance on almost all popular benchmarks while using 5x fewer active parameters during inference. 70 Mixtral 8x7B. \u00e2 Mixtral 8x7B Mixtral 8x7B 355 =o = Es & E60!\n\nThis chunk presents a detailed comparison of the performance of the Mixtral 8x7B model against various Llama models across a wide range of benchmarks, highlighting Mixtral's superior performance while using significantly fewer active parameters.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 11, "text": "Mistral 78 % 2681 Mistral 78 3 3 s0 5 = A % 66 50 g 4 45 64 78 138 348708 78 138 348708 78 138 348 70B S66 Mixtral 8x7B 50 Mixtral 8x7B 5 = 564 340 g al Mistral 78 ee Mistral 78 3 5 \u00c2\u00a7 30 5 eo \u00e2 = Mistral \u00c2\u00b0 20 \u00e2 e LlaMA2 78 (138 348 70B 7B (138 348 708 7B \u00c2\u00ab13B 34B 708 Active Params Active Params Active Params Figure 3: Results on MMLU, commonsense reasoning, world knowledge and reading comprehension, math and code for Mistral (7B/8x7B) vs Llama 2 (7B/13B/70B). Mixtral largely outperforms Llama 2 70B on all benchmarks, except on reading comprehension benchmarks while using 5x lower active parameters. It is also vastly superior to Llama 2 70B on code and math. Detailed results for Mixtral, Mistral 7B and Llama 2 7B/13B/70B and Llama 1 34B2 are reported in Table 2. Figure 2 compares the performance of Mixtral with the Llama models in different categories. Mixtral surpasses Llama 2 70B across most metrics. In particular, Mixtral displays a superior performance in code and mathematics benchmarks.\n\nThis chunk presents a comparison of the performance of the Mixtral 8x7B and Mistral 7B models against the Llama 2 family of models across various benchmarks, including commonsense reasoning, world knowledge, reading comprehension, math, and code generation. It highlights that Mixtral outperforms Llama 2 70B on most metrics while using significantly fewer active parameters.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 12, "text": "Size and Efficiency. We compare our performance to the Llama 2 family, aiming to understand Mixtral models\u00e2 efficiency in the cost-performance spectrum (see Figure 3). As a sparse Mixture- of-Experts model, Mixtral only uses 13B active parameters for each token. With 5x lower active parameters, Mixtral is able to outperform Llama 2 70B across most categories. 
Note that this analysis focuses on the active parameter count (see Section 2.1), which is directly proportional to the inference compute cost, but does not consider the memory costs and hardware utilization. The memory costs for serving Mixtral are proportional to its sparse parameter count, 47B, which is still smaller than Llama 2 70B. As for device utilization, we note that the SMoEs layer introduces additional overhead due to the routing mechanism and due to the increased memory loads when running more than one expert per device. They are more suitable for batched workloads where one can reach a good degree of arithmetic intensity. Comparison with Llama 2 70B and GPT-3.5. In Table 3, we report the performance of Mixtral 8x7B compared to Llama 2 70B and GPT-3.5. We observe that Mixtral performs similarly or above the two other models. On MMLU, Mixtral obtains a better performance, despite its significantly smaller capacity (47B tokens compared to 70B). For MT Bench, we report the performance of the latest GPT-3.5-Turbo model available, gpt-3.5-turbo-1106. 2Since Llama 2 34B was not open-sourced, we report results for Llama 1 34B.\n\nThis chunk discusses the size and efficiency of the Mixtral model, comparing its performance to the Llama 2 family of models. It highlights that Mixtral, as a sparse mixture-of-experts model, uses significantly fewer active parameters than Llama 2 70B while outperforming it across most benchmarks. The chunk also compares the performance of Mixtral 8x7B to Llama 2 70B and GPT-3.5.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 13, "text": "4 LLaMA 2 70B GPT-3.5 MMLU (MCQ in 57 subjects) 69.9% 70.0% 70.6% HellaSwag (10-shot) 87.1% 85.5% 86.7% ARC Challenge (25-shot) 85.1% 85.2% 85.8% WinoGrande (5-shot) 83.2% 81.6% 81.2% MBPP (pass@1) 49.8% 52.2% 60.7% GSM-8K (5-shot) 53.6% 57.1% 58.4% MT Bench (for Instruct Models) 6.86 8.32 8.30 # Mixtral 8x7B Table 3: Comparison of Mixtral with Llama 2 70B and GPT-3.5. Mixtral outperforms or matches Llama 2 70B and GPT-3.5 performance on most metrics. Evaluation Differences. On some benchmarks, there are some differences between our evaluation protocol and the one reported in the Llama 2 paper: 1) on MBPP, we use the hand-verified subset 2) on TriviaQA, we do not provide Wikipedia contexts. # 3.1 Multilingual benchmarks Compared to Mistral 7B, we significantly upsample the proportion of multilingual data during pretraining. The extra capacity allows Mixtral to perform well on multilingual benchmarks while maintaining a high accuracy in English. In particular, Mixtral significantly outperforms Llama 2 70B in French, German, Spanish, and Italian, as shown in Table 4.\n\nThis chunk presents a comparison of the performance of Mixtral 8x7B, Llama 2 70B, and GPT-3.5 on various benchmarks, as well as an analysis of Mixtral's performance on multilingual benchmarks.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 14, "text": "Active Params French Arc-c HellaS MMLU German Arc-c HellaS MMLU Spanish Arc-c HellaS MMLU Italian Arc-c HellaS MMLU 33B 70B 13B 42.9% 65.4% 49.0% 39.3% 68.1% 49.9% 49.9% 72.5% 64.3% 49.4% 70.9% 65.1% 58.2% 77.4% 70.9% 54.3% 73.0% 71.5% 55.4% 77.6% 72.5% 52.8% 75.1% 70.9% 41.1% 63.3% 48.7% 47.3% 68.7% 64.2% 45.7% 69.8% 52.3% 50.5% 74.5% 66.0% Table 4: Comparison of Mixtral with Llama on Multilingual Benchmarks. 
On ARC Challenge, Hellaswag, and MMLU, Mixtral outperforms Llama 2 70B on 4 languages: French, German, Spanish, and Italian. # 3.2 Long range performance To assess the capabilities of Mixtral to tackle long context, we evaluate it on the passkey retrieval task introduced in [23], a synthetic task designed to measure the ability of the model to retrieve a passkey inserted randomly in a long prompt. Results in Figure 4 (Left) show that Mixtral achieves a 100% retrieval accuracy regardless of the context length or the position of passkey in the sequence. Figure 4 (Right) shows that the perplexity of Mixtral on a subset of the proof-pile dataset [2] decreases monotonically as the size of the context increases. Passkey Performance ry 0.8 0.6 04 0.2 0.0 OK 4K 8K 12K 16K 20K 24K 28K Seq Len Passkey Loc\n\nThe chunk discusses Mixtral's performance on multilingual benchmarks and its ability to handle long-range context, demonstrating its strong capabilities in these areas.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 15, "text": "3.8 \u00e2 Mixtral_8x7B 3.5 32 > $3.0 i] 228 fos a 2.0 0 5k 10k 15k 20k 25k 30k Context length Passkey Performance ry 3.8 \u00e2 Mixtral_8x7B 3.5 0.8 32 > 0.6 $3.0 i] 228 04 fos 0.2 a 2.0 0.0 OK 4K 8K 12K 16K 20K 24K 28K 0 5k 10k 15k 20k 25k 30k Seq Len Context length Figure 4: Long range performance of Mixtral. (Left) Mixtral has 100% retrieval accuracy of the Passkey task regardless of the location of the passkey and length of the input sequence. (Right) The perplexity of Mixtral on the proof-pile dataset decreases monotonically as the context length increases.\n\nThe chunk discusses the long-range performance of the Mixtral model, demonstrating its ability to retrieve a passkey regardless of its location in a long input sequence, and showing that the model's perplexity on the proof-pile dataset decreases as the context length increases.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 16, "text": "5 # 3.3 Bias Benchmarks To identify possible flaws to be corrected by fine-tuning / preference modeling, we measure the base model performance on Bias Benchmark for QA (BBQ) [24] and Bias in Open-Ended Language Generation Dataset (BOLD) [10]. BBQ is a dataset of hand-written question sets that target attested social biases against nine differ- ent socially-relevant categories: age, dis- ability status, gender identity, nationality, physical appearance, race/ethnicity, religion, socio-economic status, sexual orientation. BOLD is a large-scale dataset that consists of 23,679 English text generation prompts for bias benchmarking across five domains. Llama 2 70B Mixtral 8x7B BBQ accuracy 51.5% 56.0% BOLD sentiment score (avg \u00c2\u00b1 std) gender profession religious_ideology political_ideology race 0.293 \u00c2\u00b1 0.073 0.218 \u00c2\u00b1 0.073 0.188 \u00c2\u00b1 0.133 0.149 \u00c2\u00b1 0.140 0.232 \u00c2\u00b1 0.049 0.323 \u00c2\u00b10.045 0.243 \u00c2\u00b1 0.087 0.144 \u00c2\u00b1 0.089 0.186 \u00c2\u00b1 0.146 0.232 \u00c2\u00b1 0.052\n\nThis chunk discusses the performance of the Mixtral 8x7B model on bias benchmarks, including the Bias Benchmark for QA (BBQ) and the Bias in Open-Ended Language Generation Dataset (BOLD), and compares it to the performance of the Llama 2 70B model.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 17, "text": "Figure 5: Bias Benchmarks. 
Compared Llama 2 70B, Mixtral presents less bias (higher accuracy on BBQ, lower std on BOLD) and displays more positive sentiment (higher avg on BOLD). We benchmark Llama 2 and Mixtral on BBQ and BOLD with our evaluation framework and report the results in Table 5. Compared to Llama 2, Mixtral presents less bias on the BBQ benchmark (56.0% vs 51.5%). For each group in BOLD, a higher average sentiment score means more positive sentiments and a lower standard deviation indicates less bias within the group. Overall, Mixtral displays more positive sentiments than Llama 2, with similar variances within each group.\n\nThe chunk discusses the performance of the Mixtral model on bias benchmarks, comparing it to the Llama 2 70B model. It shows that Mixtral presents less bias and more positive sentiment than Llama 2 70B on the BBQ and BOLD datasets.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 18, "text": "# Instruction Fine-tuning We train Mixtral \u00e2 Instruct using supervised fine-tuning (SFT) on an instruction dataset followed by Direct Preference Optimization (DPO) [25] on a paired feedback dataset. Mixtral \u00e2 Instruct reaches a score of 8.30 on MT-Bench [33] (see Table 2), making it the best open-weights model as of December 2023. Independent human evaluation conducted by LMSys is reported in Figure 63 and shows that Mixtral \u00e2 Instruct outperforms GPT-3.5-Turbo, Gemini Pro, Claude-2.1, and Llama 2 70B chat. vs Arena Elo rating 1 MT-bench (score) License 1243 9.32 Proprietary 1192 8.96 Proprietary 1158 9.18 Proprietary Glaude-4 1149 7.9 Proprietary Claude-2.0 1131 8.06 Proprietary 1121 eS) Apache 2.0 Glaude-2.4 1117 8.18 Proprietary GPT-3..5-Turbo-9613 1117 8.39 Proprietary Gemini..Pro 1141 Proprietary Glas ta 1110 7.85 Proprietary Tulu-2-0P0-708 1110 7.89 AI2 ImpACT Low-risk Yi-34B-Chat 1110 Yi License GPT-3.5:Turbo-0314 1105 7.94 Proprietary Llama-2-79b-chat 1077 6.86 Llama 2 Community Figure 6: LMSys Leaderboard. (Screenshot from Dec 22, 2023) Mixtral 8x7B Instruct v0.1 achieves an Arena Elo rating of 1121 outperforming Claude-2.1 (1117), all versions of GPT-3.5-Turbo (1117 best), Gemini Pro (1111), and Llama-2-70b-chat (1077). Mixtral is currently the best open-weights model by a large margin. 3https://huggingface.co/spaces/lmsys/chatbot-arena-leaderboard 6 # 5 Routing analysis\n\nThis chunk discusses the instruction fine-tuning of the Mixtral 8x7B model, including its performance on the MT-Bench benchmark and human evaluation results compared to other language models. It is situated within the overall document, which introduces the Mixtral 8x7B model and presents its performance on various benchmarks.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 19, "text": "In this section, we perform a small analysis on the expert selection by the router. In particular, we are interested to see if during training some experts specialized to some specific domains (e.g. mathematics, biology, philosophy, etc.). To investigate this, we measure the distribution of selected experts on different subsets of The Pile validation dataset [14]. Results are presented in Figure 7, for layers 0, 15, and 31 (layers 0 and 31 respectively being the first and the last layers of the model). Surprisingly, we do not observe obvious patterns in the assignment of experts based on the topic. 
For instance, at all layers, the distribution of expert assignment is very similar for ArXiv papers (written in Latex), for biology (PubMed Abstracts), and for Philosophy (PhilPapers) documents. Only for DM Mathematics we note a marginally different distribution of experts. This divergence is likely a consequence of the dataset\u00e2 s synthetic nature and its limited coverage of the natural language spectrum, and is particularly noticeable at the first and last layers, where the hidden states are very correlated to the input and output embeddings respectively. This suggests that the router does exhibit some structured syntactic behavior. Figure 8 shows examples of text from different domains (Python code, mathematics, and English), where each token is highlighted with a background color corresponding to its selected expert. The figure shows that words such as \u00e2\n\nThis chunk discusses an analysis of the expert selection behavior of the Mixtral model, examining whether experts specialize in certain domains during training. It presents results showing the distribution of expert assignments across different datasets and text types, and suggests that the expert selection exhibits more structured syntactic behavior rather than domain specialization.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 20, "text": "self\u00e2 in Python and \u00e2 Question\u00e2 in English often get routed through the same expert even though they involve multiple tokens. Similarly, in code, the indentation tokens are always assigned to the same experts, particularly at the first and last layers where the hidden states are more correlated to the input and output of the model. We also note from Figure 8 that consecutive tokens are often assigned the same experts. In fact, we observe some degree of positional locality in The Pile datasets. Table 5 shows the proportion of con- secutive tokens that get the same expert assignments per domain and layer. The proportion of repeated 0.20 0.15 0.10 0.05 layer: 15 0.20 0.15 0.10 0.05 layer: 31 Selection proportion 0.20 0.15 0.10 0.05 Expert ID | | ArXiv | Github | | PhilPapers | StackExchange | | DM Mathematics | | Gutenberg | | PubMed Abstracts | | Wikipedia (en) Figure 7: Proportion of tokens assigned to each expert on different domains from The Pile dataset for layers 0, 15, and 31. The gray dashed vertical line marks 1/8, i.e. the proportion expected with uniform sampling. Here, we consider experts that are either selected as a first or second choice by the router. A breakdown of the proportion of assignments done in each case cane be seen in Figure 9 in the Appendix. 7\n\nThis chunk discusses the analysis of the expert selection by the router in the Mixtral model. 
It examines the patterns of expert assignment across different text domains and layers of the model, highlighting the tendency for consecutive tokens to be assigned to the same experts, particularly at the first and last layers.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 21, "text": "Layer 0 First choice Layer 15 Layer 31 Layer 0 First or second choice Layer 15 Layer 31 ArXiv DM Mathematics Github Gutenberg PhilPapers PubMed Abstracts StackExchange Wikipedia (en) 14.0% 14.1% 14.9% 13.9% 13.6% 14.2% 13.6% 14.4% 27.9% 28.4% 28.1% 26.1% 25.3% 24.6% 27.2% 23.6% 22.7% 19.7% 19.7% 26.3% 22.1% 22.0% 23.6% 25.3% 46.5% 44.9% 49.9% 49.5% 46.9% 48.6% 48.2% 49.8% 62.3% 67.0% 66.9% 63.1% 61.9% 61.6% 64.6% 62.1% 52.9% 44.5% 49.2% 52.2% 51.3% 51.8% 53.6% 51.8% Table 5: Percentage of expert assignment repetitions. We evaluate the proportion of times the same expert is assigned to a token i and its following token i+1. We report whether the first chosen expert is the same, or whether the same expert is observed as first or second choice in consecutive tokens. For reference, the expected proportion of repetitions in the case of random assignments is 1 5 7 \u00e2 46% for \u00e2 First and second choice\u00e2\n\nThis chunk discusses the analysis of expert assignment repetitions in the Mixtral model, showing that there is a high degree of temporal locality in the expert selection, especially at higher layers of the model.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 22, "text": ". Repetitions at the first layer are close to random, but are significantly higher at layers 15 and 31. The high number of repetitions shows that expert choice exhibits high temporal locality at these layers. consecutive assignments is significantly higher than random for higher layers. This has implications in how one might optimize the model for fast training and inference. For example, cases with high locality are more likely to cause over-subscription of certain experts when doing Expert Parallelism. Conversely, this locality can be leveraged for caching, as is done in [11]. A more complete view of these same expert frequency is provided for all layers and across datasets in Figure 10 in the Appendix.\n\nThis chunk discusses the temporal locality observed in the expert assignments made by the Mixtral model, and how this has implications for optimizing the model for training and inference.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 23, "text": "# 6 Conclusion In this paper, we introduced Mixtral 8x7B, the first mixture-of-experts network to reach a state-of-the- art performance among open-source models. Mixtral 8x7B Instruct outperforms Claude-2.1, Gem- ini Pro, and GPT-3.5 Turbo on human evaluation benchmarks. Because it only uses two experts at each time step, Mixtral only uses 13B active parameters per token while outperforming the previous best model using 70B parameters per token (Llama 2 70B). We are making our trained and fine-tuned mod- els publicly available under the Apache 2.0 license. By sharing our models, we aim to facilitate the de- velopment of new techniques and applications that can benefit a wide range of industries and domains. 
Layer 0 Layer 15 Layer 31 class MoeLayer(nn.\n\nThis chunk is the concluding section of the paper, where the authors introduce Mixtral 8x7B, a mixture-of-experts language model that outperforms other open-source models while using fewer active parameters. The authors also mention making the trained and fine-tuned models publicly available under the Apache 2.0 license.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 24, "text": "Module) : \u00e2 init__(self, experts//List [nn.Modutel,) | Super (V7 init assert len(experts) > 0 self. experts = nn.ModuleList((experts) self. gate = gate self.args = moe_args def forward(self, inputs: torch.Tensor): inputs _squashed = inputs. view(-1,_ inputs.| gate_logits = self.gatel inputs_squashed) weights, selected_experts = torch. topk( gate_logits, Self-args.nun_experts_\u00c3\u00a9 weights! = nri.|funct ional softinax'( weights, din=1, dtype=torch. float, ).type_as|(inputs) results| = torch. zeros_ ike! linputs_squashe for i, expert in enunerate(self. experts): batch_idx,! nth_expert = torch. wnere( results [batch_idx] += weights [batch_i input s_squashed [batch_idx] ) return resutts:.view las{(inputs) class NoeLayer (nn. Module) = def _ init__(self, experts! List'{nri.Modulelly Super (Tz init_t assert len (experts) > 9) self.experts = nn. ModuleList((experits)) def forward(self, inputs: torch. Tensor)?! inputs_squashed = inputs.View(-1) inputs) gate_logits = self.gatel inputs_squashed) weights, selected_experts = torch. topk( getellogits, self.argssnun_experts pe weights\u00e2 = nn. functionallsoftmax(\u00c2\u00ae Weights, dtypextorch. floaty ) type_as (inputs) results| = torch. zerdsillikel(input siiequashe| for i, expert in enumerate (self. experts): batch idx, nth_expert = torch.where(s results [batch_idx] += weights [batch_i\u00c2\u00a2 inputs|_squashed[batch idx], y return resultsiiview jas (inputs) class| MoeLayer(nn. Module): def init__(self, experts\u00e2 List|fifi.Modulel) Super(Ve_init_O) assert len(experts) > 0 self, experts = nn.ModuleListl(@xperits)) self. gate = gate Self.args = moe_args def forward(self, inputs: torch.\n\nThe chunk describes the implementation details of the Mixture-of-Experts (MoE) layer used in the Mixtral language model, including the forward pass and the expert selection mechanism.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 25, "text": "Tensor): inputs_squashed = inputs.view(=1, inputs) gate_logits = self.gate( inputs_squashed) weights, selected_experts = torch. topk( gate_logits, self.argssfum_experts_pe weights) nni.unct iorial.isoftinax( YP Yiitype_as (inputs) results = torch. zerosillikel(inputslisquashe| for i, expert in enunerate(self.experts): batch_idx, nth_expert = torch.where(s results [batch_idx] += weights [batch_i\u00c2\u00a2 inputs_squashed [batch_idx] ) return) results\\iviewilas|(inputs)) Tuestiond] Solve \u00e2 AINr 27K SLIT! and SORT, lanswers 4 Question?\u00e2\n\nThis chunk appears to be discussing the implementation details of the MoE (Mixture of Experts) layer in the Mixtral language model. 
It shows the forward pass of the MoE layer, where the input is passed through the gating network to select the top-K experts, and the outputs of the selected experts are combined to produce the final output.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 26, "text": "Calculate Baiasoazusaaly 4111270 iAnswer: -841469015.544 (Question! Let\u00e2 x(gy = 94g # Hl Let! q(clJ= Zee #] IAnswer: S4ea - 30 \u00e2 Question#! Solve Azer \u00c2\u00a5 27HE = Ate and 1505 lanswer:) 4 Calculate ~eaieseiaz. saa \u00c2\u00a5 417127. ~841469015.544 \u00e2 Answer: (Questor \u00e2 Answer: et\u00e2 x(q) = 9*g \u00c2\u00a5 Wl Let! ql)! = 2eele Sara \u00e2 30 question Solve -42\u00c2\u00a5e1E B7eC= \u00e2 Ad67 and 130%] answers \\question\u00c2\u00ae| calculate savesona2.saq + auaz7. Answer: -847469015.544 \u00e2 O\u00c2\u00a5o)H A Let q(el = (questiond! Let! x(a) = awed | Answers 54a ~ \u00e2\n\nThis chunk appears to contain mathematical questions and answers, likely related to the evaluation of the Mixtral language model on mathematical benchmarks, as discussed in the overall document.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 27, "text": "A model airplane flies stower when flying into tt jwind and faster with wind at its back. when Launcl Iright angles to the wind,\u00e2 cross wind,| its groun Icompared with! flying in still air is (A) the same (B) greater (C) less (0)! either! grea lor less depending\u00e2 on wind speed i nodelaitp ane) URE slover when flying into eH lind and faster with wind at its back. When) launch Tight angles to the wind, a cross wind,. its) grounc Compared with \u00e2 lying in stitt air is (A) the same (18) greater) (C) less (D)! either grea lor less depending on wind speed H model airplane flies slower! when flying inte th wind and faster with wind at its back\u00e2\n\nThis chunk appears to be a question about the behavior of a model airplane in different wind conditions, which is used as an example in the document.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 28, "text": ". When Launcl [right angles to the wind, a cross wind, its grounc Icompared with flying in still air is (A) the sane (B) greater (C) less (0)! either gree jor less depending on wind speed Figure 8: Text samples where each token is colored with the first expert choice. The selection of experts appears to be more aligned with the syntax rather than the domain, especially at the initial and final layers. 8 # Acknowledgements We thank the CoreWeave and Scaleway teams for technical support as we trained our models. We are grateful to NVIDIA for supporting us in integrating TensorRT-LLM and Triton and working alongside us to make a sparse mixture of experts compatible with TensorRT-LLM. # References [1] Jacob Austin, Augustus Odena, Maxwell Nye, Maarten Bosma, Henryk Michalewski, David Dohan, Ellen Jiang, Carrie Cai, Michael Terry, Quoc Le, et al. Program synthesis with large language models. arXiv preprint arXiv:2108.07732, 2021. [2] Zhangir Azerbayev, Hailey Schoelkopf, Keiran Paster, Marco Dos Santos, Stephen McAleer, Albert Q Jiang, Jia Deng, Stella Biderman, and Sean Welleck.\n\nThe chunk discusses the behavior of a model airplane when launched at right angles to the wind, and provides an analysis of the expert selection in the Mixtral model. 
This is part of the overall paper that introduces the Mixtral 8x7B language model.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 29, "text": "Llemma: An open language model for mathematics. arXiv preprint arXiv:2310.10631, 2023. [3] Yonatan Bisk, Rowan Zellers, Jianfeng Gao, Yejin Choi, et al. Piqa: Reasoning about phys- ical commonsense in natural language. In Proceedings of the AAAI conference on artificial intelligence, pages 7432\u00e2 7439, 2020. [4] Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde de Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, et al. Evaluating large language models trained on code. arXiv preprint arXiv:2107.03374, 2021. [5] Eunsol Choi, He He, Mohit Iyyer, Mark Yatskar, Wen-tau Yih, Yejin Choi, Percy Liang, and Luke Zettlemoyer.\n\nThe chunk contains references to various datasets and papers related to language models, including a reference to an open language model for mathematics.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 30, "text": "Quac: Question answering in context. arXiv preprint arXiv:1808.07036, 2018. [6] Aidan Clark, Diego De Las Casas, Aurelia Guy, Arthur Mensch, Michela Paganini, Jordan Hoffmann, Bogdan Damoc, Blake Hechtman, Trevor Cai, Sebastian Borgeaud, et al. Unified scaling laws for routed language models. In International Conference on Machine Learning, pages 4057\u00e2 4086. PMLR, 2022. [7] Christopher Clark, Kenton Lee, Ming-Wei Chang, Tom Kwiatkowski, Michael Collins, and Kristina Toutanova.\n\nThe chunk discusses references related to question answering and language models, which are relevant topics covered in the overall document.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 31, "text": "Boolq: Exploring the surprising difficulty of natural yes/no questions. arXiv preprint arXiv:1905.10044, 2019. [8] Peter Clark, Isaac Cowhey, Oren Etzioni, Tushar Khot, Ashish Sabharwal, Carissa Schoenick, and Oyvind Tafjord. Think you have solved question answering? try arc, the ai2 reasoning challenge. arXiv preprint arXiv:1803.05457, 2018. [9] Karl Cobbe, Vineet Kosaraju, Mohammad Bavarian, Mark Chen, Heewoo Jun, Lukasz Kaiser, Matthias Plappert, Jerry Tworek, Jacob Hilton, Reiichiro Nakano, et al. Training verifiers to solve math word problems. arXiv preprint arXiv:2110.14168, 2021. [10] Jwala Dhamala, Tony Sun, Varun Kumar, Satyapriya Krishna, Yada Pruksachatkun, Kai-Wei Chang, and Rahul Gupta.\n\nThis chunk discusses various datasets and benchmarks used to evaluate language models, including BoolQ, ARC, and math word problems.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 32, "text": "Bold: Dataset and metrics for measuring biases in open-ended language generation. In Proceedings of the 2021 ACM conference on fairness, accountability, and transparency, pages 862\u00e2 872, 2021. [11] Artyom Eliseev and Denis Mazur. Fast inference of mixture-of-experts language models with offloading. arXiv preprint arXiv:2312.17238, 2023. [12] William Fedus, Jeff Dean, and Barret Zoph. A review of sparse expert models in deep learning. arXiv preprint arXiv:2209.01667, 2022. 
[13] Trevor Gale, Deepak Narayanan, Cliff Young, and Matei Zaharia.\n\nThe chunk discusses related work on measuring biases in language models, fast inference of mixture-of-experts models, and a review of sparse expert models in deep learning.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 33, "text": "Megablocks: Efficient sparse training with mixture-of-experts. arXiv preprint arXiv:2211.15841, 2022. [14] Leo Gao, Stella Biderman, Sid Black, Laurence Golding, Travis Hoppe, Charles Foster, Jason Phang, Horace He, Anish Thite, Noa Nabeshima, et al. The pile: An 800gb dataset of diverse text for language modeling. arXiv preprint arXiv:2101.00027, 2020. [15] Hussein Hazimeh, Zhe Zhao, Aakanksha Chowdhery, Maheswaran Sathiamoorthy, Yihua Chen, Rahul Mazumder, Lichan Hong, and Ed Chi.\n\nThe chunk discusses the Megablocks technique for efficient sparse training with mixture-of-experts models, as well as the Pile dataset used for language modeling, and a paper on differentiable selection in mixture-of-experts models.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 34, "text": "Dselect-k: Differentiable selection in the mixture of experts with applications to multi-task learning. Advances in Neural Information Processing Systems, 34:29335\u00e2 29347, 2021. 9 [16] Dan Hendrycks, Collin Burns, Steven Basart, Andy Zou, Mantas Mazeika, Dawn Song, and Jacob Steinhardt. Measuring massive multitask language understanding. arXiv preprint arXiv:2009.03300, 2020. [17] Dan Hendrycks, Collin Burns, Saurav Kadavath, Akul Arora, Steven Basart, Eric Tang, Dawn Song, and Jacob Steinhardt. Measuring mathematical problem solving with the math dataset. arXiv preprint arXiv:2103.03874, 2021. [18] Albert Q Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lucile Saulnier, et al. Mistral 7b. arXiv preprint arXiv:2310.06825, 2023. [19] Mandar Joshi, Eunsol Choi, Daniel S Weld, and Luke Zettlemoyer.\n\nThe chunk discusses related work on mixture-of-experts models and language understanding benchmarks, which are relevant to the introduction and results sections of the Mixtral paper.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 35, "text": "Triviaqa: A large scale distantly supervised challenge dataset for reading comprehension. arXiv preprint arXiv:1705.03551, 2017. [20] Tom Kwiatkowski, Jennimaria Palomaki, Olivia Redfield, Michael Collins, Ankur Parikh, Chris Alberti, Danielle Epstein, Illia Polosukhin, Jacob Devlin, Kenton Lee, et al. Natural questions: a benchmark for question answering research. Transactions of the Association for Computational Linguistics, pages 453\u00e2 466, 2019. [21] Dmitry Lepikhin, HyoukJoong Lee, Yuanzhong Xu, Dehao Chen, Orhan Firat, Yanping Huang, Maxim Krikun, Noam Shazeer, and Zhifeng Chen.\n\nThe chunk discusses references related to datasets and models used in the paper, specifically TriviaQA and Natural Questions datasets, as well as the GShard model.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 36, "text": "Gshard: Scaling giant models with condi- tional computation and automatic sharding. arXiv preprint arXiv:2006.16668, 2020. [22] Todor Mihaylov, Peter Clark, Tushar Khot, and Ashish Sabharwal. 
Can a suit of armor conduct electricity? a new dataset for open book question answering. arXiv preprint arXiv:1809.02789, 2018. [23] Amirkeivan Mohtashami and Martin Jaggi.\n\nThe chunk discusses related work on mixture-of-experts models and datasets for open-book question answering, which are relevant to the technical details and evaluations presented in the overall document.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 37, "text": "Landmark attention: Random-access infinite context length for transformers. arXiv preprint arXiv:2305.16300, 2023. [24] Alicia Parrish, Angelica Chen, Nikita Nangia, Vishakh Padmakumar, Jason Phang, Jana Thomp- son, Phu Mon Htut, and Samuel R Bowman. Bbq: A hand-built bias benchmark for question answering. arXiv preprint arXiv:2110.08193, 2021. [25] Rafael Rafailov, Archit Sharma, Eric Mitchell, Stefano Ermon, Christopher D Manning, and Chelsea Finn. Direct preference optimization: Your language model is secretly a reward model. arXiv preprint arXiv:2305.18290, 2023. [26] Keisuke Sakaguchi, Ronan Le Bras, Chandra Bhagavatula, and Yejin Choi.\n\nThe chunk discusses references related to long-range performance, bias benchmarks, and instruction fine-tuning of language models, which are topics covered in the overall document.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 38, "text": "Winogrande: An adversarial winograd schema challenge at scale. Communications of the ACM, pages 99\u00e2 106, 2021. [27] Maarten Sap, Hannah Rashkin, Derek Chen, Ronan LeBras, and Yejin Choi. Socialiqa: Com- monsense reasoning about social interactions. arXiv preprint arXiv:1904.09728, 2019. [28] Noam Shazeer, Azalia Mirhoseini, Krzysztof Maziarz, Andy Davis, Quoc Le, Geoffrey Hinton, and Jeff Dean.\n\nThis chunk discusses references related to commonsense reasoning and mixture-of-experts models, which are relevant topics covered in the overall document.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 39, "text": "Outrageously large neural networks: The sparsely-gated mixture-of-experts layer. arXiv preprint arXiv:1701.06538, 2017. [29] Mirac Suzgun, Nathan Scales, Nathanael Sch\u00c3\u00a4rli, Sebastian Gehrmann, Yi Tay, Hyung Won Chung, Aakanksha Chowdhery, Quoc V Le, Ed H Chi, Denny Zhou, , and Jason Wei. Challenging big-bench tasks and whether chain-of-thought can solve them. arXiv preprint arXiv:2210.09261, 2022. [30] Alon Talmor, Jonathan Herzig, Nicholas Lourie, and Jonathan Berant.\n\nThe chunk discusses prior work on mixture-of-experts models and challenging language model benchmarks, which is relevant background information for the Mixtral model presented in the document.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 40, "text": "Commonsenseqa: A ques- tion answering challenge targeting commonsense knowledge. arXiv preprint arXiv:1811.00937, 2018. [31] Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u00c5 ukasz Kaiser, and Illia Polosukhin. Attention is all you need. Advances in neural information processing systems, 30, 2017. 
[32] Rowan Zellers, Ari Holtzman, Yonatan Bisk, Ali Farhadi, and Yejin Choi.\n\nThis chunk appears to be a list of references cited in the document, specifically references [30], [31], and [32].", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 41, "text": "Hellaswag: Can a machine really finish your sentence? arXiv preprint arXiv:1905.07830, 2019. [33] Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, Yonghao Zhuang, Zi Lin, Zhuohan Li, Dacheng Li, Eric Xing, et al. Judging llm-as-a-judge with mt-bench and chatbot arena. arXiv preprint arXiv:2306.05685, 2023.\n\nThe chunk discusses two references related to language model benchmarking, including the Hellaswag dataset and the MT-Bench and Chatbot Arena benchmarks. This is situated within the broader context of the paper, which introduces the Mixtral language model and evaluates its performance on various benchmarks.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 42, "text": "10 [34] Wanjun Zhong, Ruixiang Cui, Yiduo Guo, Yaobo Liang, Shuai Lu, Yanlin Wang, Amin Saied, Weizhu Chen, and Nan Duan. Agieval: A human-centric benchmark for evaluating foundation models. arXiv preprint arXiv:2304.06364, 2023. [35] Yanqi Zhou, Tao Lei, Hanxiao Liu, Nan Du, Yanping Huang, Vincent Zhao, Andrew M Dai, Quoc V Le, James Laudon, et al.\n\nThe chunk contains references to related work on evaluating foundation models, which is relevant to the overall topic of the document discussing the Mixtral language model.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 43, "text": "Mixture-of-experts with expert choice routing. Advances in Neural Information Processing Systems, 35:7103\u00e2 7114, 2022. 11 # Either choice 0 Layer -- 0.3 0.2 0 Layer 0 -- First choice 0.3 Layer 0 -- Second choice 0.3 < 2 t Layer 15 -- First choice fe} Q 0.3 \u00c2\u00b0 a 0.2 el (el er rere! ie it len | ie} o 0 v Layer 15 -- Second choice 8 03 0.2 0 Layer 31 -- Either choice # Expert ID\n\nThis chunk discusses the expert assignment patterns observed in the Mixtral model, analyzing the proportion of tokens assigned to each expert at different layers of the model. It provides insights into the temporal locality and syntactic alignment of the expert selection.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 44, "text": "ArXiv Github PhilPapers. StackExchange |_| | |_| | | DM Mathematics | Gutenberg || PubMed Abstracts | Wikipedia (en) Figure 9: Proportion of tokens assigned to each expert on different subsets from The Pile dataset, separated by whether the expert was selected as first or second choice, or either. The \u00e2 Either choice\u00e2 case is equivalent to Figure 7. The gray dashed vertical line marks 1 12 First choice 9 w is) \u00c2\u00b0 N a \u00c2\u00b0 N is) \u00c2\u00b0 An wu 0.7 0.6 Proportion of repeated assignments 0.5 Layer source \u00e2\n\nThis chunk discusses the distribution of expert assignments across different subsets of The Pile dataset, and the proportion of repeated expert assignments across layers and datasets. 
It provides additional details and visualizations to the analysis presented in the main body of the paper.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 45, "text": "e ArXiv \u00e2 e\u00e2 DM Mathematics \u00e2 e Github \u00e2 e\u00e2 Gutenberg \u00e2 e\u00e2 PhilPapers \u00e2 e\u00e2 PubMed \u00e2 e- StackExchange \u00e2 e-\u00e2 Wikipedia (en) # Abstracts Figure 10: Repeated consecutive assignments per MoE layer. Repeated assignments occur a lot more often than they would with uniform assignments (materialized by the dashed lines). Patterns are similar across datasets with less repetitions for DM Mathematics. 13\n\nThis chunk discusses the analysis of expert assignment patterns in the Mixtral model, showing that there is a high degree of temporal locality in the expert selection, especially at higher layers of the model. This analysis provides insights into the behavior of the sparse mixture-of-experts architecture used in Mixtral.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}] -------------------------------------------------------------------------------- /data/dense_results.json: -------------------------------------------------------------------------------- 1 | [{"id": 15, "text": "3.8 \u00e2 Mixtral_8x7B 3.5 32 > $3.0 i] 228 fos a 2.0 0 5k 10k 15k 20k 25k 30k Context length Passkey Performance ry 3.8 \u00e2 Mixtral_8x7B 3.5 0.8 32 > 0.6 $3.0 i] 228 04 fos 0.2 a 2.0 0.0 OK 4K 8K 12K 16K 20K 24K 28K 0 5k 10k 15k 20k 25k 30k Seq Len Context length Figure 4: Long range performance of Mixtral. (Left) Mixtral has 100% retrieval accuracy of the Passkey task regardless of the location of the passkey and length of the input sequence. (Right) The perplexity of Mixtral on the proof-pile dataset decreases monotonically as the context length increases.\n\nThe chunk discusses the long-range performance of the Mixtral model, demonstrating its ability to retrieve a passkey regardless of its location in a long input sequence, and showing that the model's perplexity on the proof-pile dataset decreases as the context length increases.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 4, "text": "Instruct under the Apache 2.0 license1, free for academic and commercial usage, ensuring broad accessibility and potential for diverse applications. To enable the community to run Mixtral with a fully open-source stack, we submitted changes to the vLLM project, which integrates Megablocks CUDA kernels for efficient inference. Skypilot also allows the deployment of vLLM endpoints on any instance in the cloud. # 2 Architectural details Mixtral is based on a transformer architecture [31] and uses the same modifications as described in [18], with the notable exceptions that Mix- tral supports a fully dense context length of 32k tokens, and the feed- forward blocks are replaced by Mixture-of-Expert layers (Section 2.1). The model architecture parameters are summarized in Table 1.\n\nThis chunk describes the architectural details of the Mixtral language model, including its use of a transformer architecture with a 32k token context length and mixture-of-expert layers. It also mentions the model's open-source licensing and deployment options.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 2, "text": "experts\u00e2 ) to process the token and combine their output additively. 
This technique increases the number of parameters of a model while controlling cost and latency, as the model only uses a fraction of the total set of parameters per token. Mixtral is pretrained with multilingual data using a context size of 32k tokens. It either matches or exceeds the performance of Llama 2 70B and GPT-3.5, over several benchmarks. In particular, Mixture of Experts Layer i gating inputs af outputs router expert\n\nThis chunk describes the key architectural details of the Mixtral model, a sparse mixture-of-experts language model that outperforms larger models like Llama 2 70B and GPT-3.5 on various benchmarks.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 6, "text": "Table 1: Model architecture. # j n\u00e2 G(x)i \u00c2\u00b7 Ei(x). i=0 Here, G(x)i denotes the n-dimensional output of the gating network for the i-th expert, and Ei(x) is the output of the i-th expert network. If the gating vector is sparse, we can avoid computing the outputs of experts whose gates are zero. There are multiple alternative ways of implementing G(x) [6, 15, 35], but a simple and performant one is implemented by taking the softmax over the Top-K logits of a linear layer [28].\n\nThe chunk describes the architectural details of the Mixtral model, specifically the Sparse Mixture of Experts (SMoE) layer that is used in the model.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 7, "text": "We use G(x) := Softmax(TopK(x \u00c2\u00b7 Wg)), where (TopK(\u00e2 ))i := \u00e2 i if \u00e2 i is among the top-K coordinates of logits \u00e2 \u00e2 Rn and (TopK(\u00e2 ))i := \u00e2 \u00e2 otherwise. The value of K \u00e2 the number of experts used per token \u00e2 is a hyper-parameter that modu- lates the amount of compute used to process each token. If one increases n while keeping K fixed, one # 1https://mistral.ai/news/mixtral-of-experts/\n\nThis chunk describes the gating mechanism used in the Mixture of Experts (MoE) layer of the Mixtral model. It explains how the router network selects the top-K experts to process each token, and how this allows the model to increase its parameter count while keeping the computational cost constant.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 42, "text": "10 [34] Wanjun Zhong, Ruixiang Cui, Yiduo Guo, Yaobo Liang, Shuai Lu, Yanlin Wang, Amin Saied, Weizhu Chen, and Nan Duan. Agieval: A human-centric benchmark for evaluating foundation models. arXiv preprint arXiv:2304.06364, 2023. [35] Yanqi Zhou, Tao Lei, Hanxiao Liu, Nan Du, Yanping Huang, Vincent Zhao, Andrew M Dai, Quoc V Le, James Laudon, et al.\n\nThe chunk contains references to related work on evaluating foundation models, which is relevant to the overall topic of the document discussing the Mixtral language model.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 45, "text": "e ArXiv \u00e2 e\u00e2 DM Mathematics \u00e2 e Github \u00e2 e\u00e2 Gutenberg \u00e2 e\u00e2 PhilPapers \u00e2 e\u00e2 PubMed \u00e2 e- StackExchange \u00e2 e-\u00e2 Wikipedia (en) # Abstracts Figure 10: Repeated consecutive assignments per MoE layer. Repeated assignments occur a lot more often than they would with uniform assignments (materialized by the dashed lines). Patterns are similar across datasets with less repetitions for DM Mathematics. 
13\n\nThis chunk discusses the analysis of expert assignment patterns in the Mixtral model, showing that there is a high degree of temporal locality in the expert selection, especially at higher layers of the model. This analysis provides insights into the behavior of the sparse mixture-of-experts architecture used in Mixtral.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 0, "text": "4 2 0 2 n a J 8 ] G L . s c [ 1 v 8 8 0 4 0 . 1 0 4 2 : v i X r a # Mixtral of Experts Albert Q. Jiang, Alexandre Sablayrolles, Antoine Roux, Arthur Mensch, Blanche Savary, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Emma Bou Hanna, Florian Bressand, Gianna Lengyel, Guillaume Bour, Guillaume Lample, L\u00c3\u00a9lio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Sandeep Subramanian, Sophia Yang, Szymon Antoniak, Teven Le Scao, Th\u00c3\u00a9ophile Gervet, Thibaut Lavril, Thomas Wang, Timoth\u00c3\u00a9e Lacroix, William El Sayed Abstract We introduce Mixtral 8x7B, a Sparse Mixture of Experts (SMoE) language model. Mixtral has the same architecture as Mistral 7B, with the difference that each layer is composed of 8 feedforward blocks (i.e. experts). For every token, at each layer, a router network selects two experts to process the current state and combine their outputs. Even though each token only sees two experts, the selected experts can be different at each timestep. As a result, each token has access to 47B parameters, but only uses 13B active parameters during inference. Mixtral was trained with a context size of 32k tokens and it outperforms or matches Llama 2 70B and GPT-3.5 across all evaluated benchmarks. In particular, Mixtral vastly outperforms Llama 2 70B on mathematics, code generation, and multilingual benchmarks. We also provide a model fine- tuned to follow instructions, Mixtral 8x7B \u00e2 Instruct, that surpasses GPT-3.5 Turbo, Claude-2.1, Gemini Pro, and Llama 2 70B \u00e2\n\nThis chunk introduces Mixtral 8x7B, a sparse mixture of experts language model that outperforms Llama 2 70B and GPT-3.5 on various benchmarks. It also describes the model architecture and the fine-tuned Mixtral 8x7B - Instruct model.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 5, "text": "Parameter Value dim n_layers head_dim hidden_dim n_heads n_kv_heads context_len vocab_size num_experts top_k_experts # 2.1 Sparse Mixture of Experts We present a brief overview of the Mixture of Experts layer (Figure 1). For a more in-depth overview, see [12]. The output of the MoE module for a given input x is determined by the weighted sum of the outputs of the expert networks, where the weights are given by the gating network\u00e2 s output. i.e. 
given n expert networks {E0, Ei, ..., En\u00e2 1}, the output of the expert layer is given by:\n\nThis chunk describes the architectural details of the Mixtral model, specifically the Sparse Mixture of Experts layer that is a key component of the model.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 11, "text": "Mistral 78 % 2681 Mistral 78 3 3 s0 5 = A % 66 50 g 4 45 64 78 138 348708 78 138 348708 78 138 348 70B S66 Mixtral 8x7B 50 Mixtral 8x7B 5 = 564 340 g al Mistral 78 ee Mistral 78 3 5 \u00c2\u00a7 30 5 eo \u00e2 = Mistral \u00c2\u00b0 20 \u00e2 e LlaMA2 78 (138 348 70B 7B (138 348 708 7B \u00c2\u00ab13B 34B 708 Active Params Active Params Active Params Figure 3: Results on MMLU, commonsense reasoning, world knowledge and reading comprehension, math and code for Mistral (7B/8x7B) vs Llama 2 (7B/13B/70B). Mixtral largely outperforms Llama 2 70B on all benchmarks, except on reading comprehension benchmarks while using 5x lower active parameters. It is also vastly superior to Llama 2 70B on code and math. Detailed results for Mixtral, Mistral 7B and Llama 2 7B/13B/70B and Llama 1 34B2 are reported in Table 2. Figure 2 compares the performance of Mixtral with the Llama models in different categories. Mixtral surpasses Llama 2 70B across most metrics. In particular, Mixtral displays a superior performance in code and mathematics benchmarks.\n\nThis chunk presents a comparison of the performance of the Mixtral 8x7B and Mistral 7B models against the Llama 2 family of models across various benchmarks, including commonsense reasoning, world knowledge, reading comprehension, math, and code generation. It highlights that Mixtral outperforms Llama 2 70B on most metrics while using significantly fewer active parameters.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}] -------------------------------------------------------------------------------- /data/shokz/OpenRun Pro User Guide.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guyernest/advanced-rag/63d9ec59daf82e60023d3a336988e32d6b6c1d4a/data/shokz/OpenRun Pro User Guide.pdf -------------------------------------------------------------------------------- /data/sparse_results.json: -------------------------------------------------------------------------------- 1 | [{"id": 14, "text": "Active Params French Arc-c HellaS MMLU German Arc-c HellaS MMLU Spanish Arc-c HellaS MMLU Italian Arc-c HellaS MMLU 33B 70B 13B 42.9% 65.4% 49.0% 39.3% 68.1% 49.9% 49.9% 72.5% 64.3% 49.4% 70.9% 65.1% 58.2% 77.4% 70.9% 54.3% 73.0% 71.5% 55.4% 77.6% 72.5% 52.8% 75.1% 70.9% 41.1% 63.3% 48.7% 47.3% 68.7% 64.2% 45.7% 69.8% 52.3% 50.5% 74.5% 66.0% Table 4: Comparison of Mixtral with Llama on Multilingual Benchmarks. On ARC Challenge, Hellaswag, and MMLU, Mixtral outperforms Llama 2 70B on 4 languages: French, German, Spanish, and Italian. # 3.2 Long range performance To assess the capabilities of Mixtral to tackle long context, we evaluate it on the passkey retrieval task introduced in [23], a synthetic task designed to measure the ability of the model to retrieve a passkey inserted randomly in a long prompt. Results in Figure 4 (Left) show that Mixtral achieves a 100% retrieval accuracy regardless of the context length or the position of passkey in the sequence. 
Figure 4 (Right) shows that the perplexity of Mixtral on a subset of the proof-pile dataset [2] decreases monotonically as the size of the context increases. Passkey Performance ry 0.8 0.6 04 0.2 0.0 OK 4K 8K 12K 16K 20K 24K 28K Seq Len Passkey Loc\n\nThe chunk discusses Mixtral's performance on multilingual benchmarks and its ability to handle long-range context, demonstrating its strong capabilities in these areas.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 2, "text": "experts\u00e2 ) to process the token and combine their output additively. This technique increases the number of parameters of a model while controlling cost and latency, as the model only uses a fraction of the total set of parameters per token. Mixtral is pretrained with multilingual data using a context size of 32k tokens. It either matches or exceeds the performance of Llama 2 70B and GPT-3.5, over several benchmarks. In particular, Mixture of Experts Layer i gating inputs af outputs router expert\n\nThis chunk describes the key architectural details of the Mixtral model, a sparse mixture-of-experts language model that outperforms larger models like Llama 2 70B and GPT-3.5 on various benchmarks.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 0, "text": "4 2 0 2 n a J 8 ] G L . s c [ 1 v 8 8 0 4 0 . 1 0 4 2 : v i X r a # Mixtral of Experts Albert Q. Jiang, Alexandre Sablayrolles, Antoine Roux, Arthur Mensch, Blanche Savary, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Emma Bou Hanna, Florian Bressand, Gianna Lengyel, Guillaume Bour, Guillaume Lample, L\u00c3\u00a9lio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Sandeep Subramanian, Sophia Yang, Szymon Antoniak, Teven Le Scao, Th\u00c3\u00a9ophile Gervet, Thibaut Lavril, Thomas Wang, Timoth\u00c3\u00a9e Lacroix, William El Sayed Abstract We introduce Mixtral 8x7B, a Sparse Mixture of Experts (SMoE) language model. Mixtral has the same architecture as Mistral 7B, with the difference that each layer is composed of 8 feedforward blocks (i.e. experts). For every token, at each layer, a router network selects two experts to process the current state and combine their outputs. Even though each token only sees two experts, the selected experts can be different at each timestep. As a result, each token has access to 47B parameters, but only uses 13B active parameters during inference. Mixtral was trained with a context size of 32k tokens and it outperforms or matches Llama 2 70B and GPT-3.5 across all evaluated benchmarks. In particular, Mixtral vastly outperforms Llama 2 70B on mathematics, code generation, and multilingual benchmarks. We also provide a model fine- tuned to follow instructions, Mixtral 8x7B \u00e2 Instruct, that surpasses GPT-3.5 Turbo, Claude-2.1, Gemini Pro, and Llama 2 70B \u00e2\n\nThis chunk introduces Mixtral 8x7B, a sparse mixture of experts language model that outperforms Llama 2 70B and GPT-3.5 on various benchmarks. 
It also describes the model architecture and the fine-tuned Mixtral 8x7B - Instruct model.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 15, "text": "3.8 \u00e2 Mixtral_8x7B 3.5 32 > $3.0 i] 228 fos a 2.0 0 5k 10k 15k 20k 25k 30k Context length Passkey Performance ry 3.8 \u00e2 Mixtral_8x7B 3.5 0.8 32 > 0.6 $3.0 i] 228 04 fos 0.2 a 2.0 0.0 OK 4K 8K 12K 16K 20K 24K 28K 0 5k 10k 15k 20k 25k 30k Seq Len Context length Figure 4: Long range performance of Mixtral. (Left) Mixtral has 100% retrieval accuracy of the Passkey task regardless of the location of the passkey and length of the input sequence. (Right) The perplexity of Mixtral on the proof-pile dataset decreases monotonically as the context length increases.\n\nThe chunk discusses the long-range performance of the Mixtral model, demonstrating its ability to retrieve a passkey regardless of its location in a long input sequence, and showing that the model's perplexity on the proof-pile dataset decreases as the context length increases.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 12, "text": "Size and Efficiency. We compare our performance to the Llama 2 family, aiming to understand Mixtral models\u00e2 efficiency in the cost-performance spectrum (see Figure 3). As a sparse Mixture- of-Experts model, Mixtral only uses 13B active parameters for each token. With 5x lower active parameters, Mixtral is able to outperform Llama 2 70B across most categories. Note that this analysis focuses on the active parameter count (see Section 2.1), which is directly proportional to the inference compute cost, but does not consider the memory costs and hardware utilization. The memory costs for serving Mixtral are proportional to its sparse parameter count, 47B, which is still smaller than Llama 2 70B. As for device utilization, we note that the SMoEs layer introduces additional overhead due to the routing mechanism and due to the increased memory loads when running more than one expert per device. They are more suitable for batched workloads where one can reach a good degree of arithmetic intensity. Comparison with Llama 2 70B and GPT-3.5. In Table 3, we report the performance of Mixtral 8x7B compared to Llama 2 70B and GPT-3.5. We observe that Mixtral performs similarly or above the two other models. On MMLU, Mixtral obtains a better performance, despite its significantly smaller capacity (47B tokens compared to 70B). For MT Bench, we report the performance of the latest GPT-3.5-Turbo model available, gpt-3.5-turbo-1106. 2Since Llama 2 34B was not open-sourced, we report results for Llama 1 34B.\n\nThis chunk discusses the size and efficiency of the Mixtral model, comparing its performance to the Llama 2 family of models. It highlights that Mixtral, as a sparse mixture-of-experts model, uses significantly fewer active parameters than Llama 2 70B while outperforming it across most benchmarks. The chunk also compares the performance of Mixtral 8x7B to Llama 2 70B and GPT-3.5.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 4, "text": "Instruct under the Apache 2.0 license1, free for academic and commercial usage, ensuring broad accessibility and potential for diverse applications. To enable the community to run Mixtral with a fully open-source stack, we submitted changes to the vLLM project, which integrates Megablocks CUDA kernels for efficient inference. 
Skypilot also allows the deployment of vLLM endpoints on any instance in the cloud. # 2 Architectural details Mixtral is based on a transformer architecture [31] and uses the same modifications as described in [18], with the notable exceptions that Mix- tral supports a fully dense context length of 32k tokens, and the feed- forward blocks are replaced by Mixture-of-Expert layers (Section 2.1). The model architecture parameters are summarized in Table 1.\n\nThis chunk describes the architectural details of the Mixtral language model, including its use of a transformer architecture with a 32k token context length and mixture-of-expert layers. It also mentions the model's open-source licensing and deployment options.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 41, "text": "Hellaswag: Can a machine really finish your sentence? arXiv preprint arXiv:1905.07830, 2019. [33] Lianmin Zheng, Wei-Lin Chiang, Ying Sheng, Siyuan Zhuang, Zhanghao Wu, Yonghao Zhuang, Zi Lin, Zhuohan Li, Dacheng Li, Eric Xing, et al. Judging llm-as-a-judge with mt-bench and chatbot arena. arXiv preprint arXiv:2306.05685, 2023.\n\nThe chunk discusses two references related to language model benchmarking, including the Hellaswag dataset and the MT-Bench and Chatbot Arena benchmarks. This is situated within the broader context of the paper, which introduces the Mixtral language model and evaluates its performance on various benchmarks.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 3, "text": "Figure 1: Mixture of Experts Layer. Each input vector is assigned to 2 of the 8 experts by a router. The layer\u00e2 s output is the weighted sum of the outputs of the two selected experts. In Mixtral, an expert is a standard feedforward block as in a vanilla transformer architecture. Mixtral demonstrates superior capabilities in mathematics, code generation, and tasks that require multilingual understanding, significantly outperforming Llama 2 70B in these domains. Experiments show that Mixtral is able to successfully retrieve information from its context window of 32k tokens, regardless of the sequence length and the location of the information in the sequence. We also present Mixtral 8x7B \u00e2 Instruct, a chat model fine-tuned to follow instructions using supervised fine-tuning and Direct Preference Optimization [25]. Its performance notably surpasses that of GPT-3.5 Turbo, Claude-2.1, Gemini Pro, and Llama 2 70B \u00e2 chat model on human evaluation benchmarks. Mixtral \u00e2 Instruct also demonstrates reduced biases, and a more balanced sentiment profile in benchmarks such as BBQ, and BOLD. We release both Mixtral 8x7B and Mixtral 8x7B \u00e2\n\nThis chunk describes the Mixture of Experts layer architecture used in the Mixtral model, as well as the superior performance of Mixtral compared to other models on various benchmarks, including mathematics, code generation, and multilingual tasks. It also introduces the Mixtral 8x7B - Instruct model, which is fine-tuned to follow instructions and outperforms other chat models on human evaluation benchmarks.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 30, "text": "Quac: Question answering in context. arXiv preprint arXiv:1808.07036, 2018. 
[6] Aidan Clark, Diego De Las Casas, Aurelia Guy, Arthur Mensch, Michela Paganini, Jordan Hoffmann, Bogdan Damoc, Blake Hechtman, Trevor Cai, Sebastian Borgeaud, et al. Unified scaling laws for routed language models. In International Conference on Machine Learning, pages 4057\u00e2 4086. PMLR, 2022. [7] Christopher Clark, Kenton Lee, Ming-Wei Chang, Tom Kwiatkowski, Michael Collins, and Kristina Toutanova.\n\nThe chunk discusses references related to question answering and language models, which are relevant topics covered in the overall document.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}, {"id": 37, "text": "Landmark attention: Random-access infinite context length for transformers. arXiv preprint arXiv:2305.16300, 2023. [24] Alicia Parrish, Angelica Chen, Nikita Nangia, Vishakh Padmakumar, Jason Phang, Jana Thomp- son, Phu Mon Htut, and Samuel R Bowman. Bbq: A hand-built bias benchmark for question answering. arXiv preprint arXiv:2110.08193, 2021. [25] Rafael Rafailov, Archit Sharma, Eric Mitchell, Stefano Ermon, Christopher D Manning, and Chelsea Finn. Direct preference optimization: Your language model is secretly a reward model. arXiv preprint arXiv:2305.18290, 2023. [26] Keisuke Sakaguchi, Ronan Le Bras, Chandra Bhagavatula, and Yejin Choi.\n\nThe chunk discusses references related to long-range performance, bias benchmarks, and instruction fine-tuning of language models, which are topics covered in the overall document.", "metadata": {"title": "Mixtral of Experts", "arxiv_id": "2401.04088", "references": ["1905.07830"]}}] -------------------------------------------------------------------------------- /exercise/01_simple_rag_exercise.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Simple RAG Exercise\n", 8 | "\n", 9 | "Build a simple RAG flow to recommend oldie movies based on user's requests. The dataset includes 5,000 movies descriptions. In the exercise, you will learn to add a filter to the semantic retrieval and the data columns sent to the generation step.\n", 10 | "\n", 11 | "Fill in the empty cells, and answer the questions on the course site." 
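The first fill-in cell below loads the movies dataset from the Hugging Face Hub. As a hedged starting point (the `train` split name is an assumption; check the dataset card at https://huggingface.co/datasets/AiresPucrs/tmdb-5000-movies before relying on it), a minimal sketch of that loading step:

```python
from datasets import load_dataset

# Minimal sketch of the dataset-loading step for the exercise.
# The "train" split name is an assumption; verify it on the dataset card.
dataset = load_dataset("AiresPucrs/tmdb-5000-movies", split="train")
print(dataset)
```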
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "from rich.console import Console\n", 21 | "from rich_theme_manager import Theme, ThemeManager\n", 22 | "import pathlib\n", 23 | "\n", 24 | "theme_dir = pathlib.Path(\"../themes\")\n", 25 | "theme_manager = ThemeManager(theme_dir=theme_dir)\n", 26 | "dark = theme_manager.get(\"dark\")\n", 27 | "\n", 28 | "# Create a console with the dark theme\n", 29 | "console = Console(theme=dark)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "## Loading the Movie Dataset\n", 37 | "\n", 38 | "We will load the moview dataset from Hugging Face hub in:\n", 39 | "https://huggingface.co/datasets/AiresPucrs/tmdb-5000-movies" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "from datasets import load_dataset\n", 49 | "\n", 50 | "### YOUR CODE HERE ###" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "console.print(dataset)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "## Encode using Vector Embedding\n", 67 | "\n", 68 | "We will use one of the popular open source vector databases, [Qdrant](https://qdrant.tech/), and one of the popular embedding encoder and text transformer libraries, [SentenceTransformer](https://sbert.net/).\n", 69 | "\n", 70 | "This time we will use the following sentence similarity model:\n", 71 | "https://huggingface.co/sentence-transformers/all-mpnet-base-v2" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "from qdrant_client import models, QdrantClient\n", 81 | "from sentence_transformers import SentenceTransformer\n", 82 | "\n", 83 | "# create the vector database client\n", 84 | "qdrant = QdrantClient(\":memory:\") # Create in-memory Qdrant instance\n", 85 | "\n", 86 | "# Create the embedding encoder\n", 87 | "### YOUR CODE HERE ###\n" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "console.print(encoder)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "# Create collection to store the wine rating data\n", 106 | "collection_name=\"movies\"\n", 107 | "\n", 108 | "qdrant.recreate_collection(\n", 109 | " collection_name=collection_name,\n", 110 | " vectors_config=models.VectorParams(\n", 111 | " size=encoder.get_sentence_embedding_dimension(), # Vector size is defined by used model\n", 112 | " distance=models.Distance.COSINE\n", 113 | " )\n", 114 | ")" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "### Loading the data into the vector database\n", 122 | "\n", 123 | "We will use the collection that we created above, to go over all the rows and encode the `overview` column of the wine dataset, encode it with the encoder into embedding vector, and store it in the vector database. Please use the index of the movie from the dataset (`id` column) as the `id` in the vector index.\n", 124 | "\n", 125 | "Please note that some of the rows are missing the `overview`. 
You should ignore them and not upload them into the vector database index.\n", 126 | "\n", 127 | "This step will take a few seconds (less than a minute on my laptop)." 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 7, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "# vectorize!\n", 137 | "qdrant.upload_points(\n", 138 | " collection_name=collection_name,\n", 139 | " points=[\n", 140 | "### YOUR CODE HERE ###\n", 141 | " ]\n", 142 | ")" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "console.print(\n", 152 | " qdrant\n", 153 | " .get_collection(\n", 154 | " collection_name=collection_name\n", 155 | " )\n", 156 | ")" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "## **R**etrieve sematically relevant data based on user's query\n", 164 | "\n", 165 | "Once the data is loaded into the vector database and the indexing process is done, we can start using our simple RAG system." 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 49, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "user_prompt = \"Love story between an Asian king and European teacher\"" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "### Encoding the user's query\n", 182 | "\n", 183 | "We will use the same encoder that we used to encode the document data to encode the query of the user. \n", 184 | "This way we can search results based on semantic similarity. " 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 50, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "query_vector = encoder.encode(user_prompt).tolist()" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": {}, 199 | "source": [ 200 | "### Create filter on the results\n", 201 | "\n", 202 | "We only want movies from the '90s. Please create a filter base on the `release_date` column. Check the Qdrant documentation in: https://qdrant.tech/documentation/concepts/filtering/#datetime-range" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 51, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "from qdrant_client import models\n", 212 | "\n", 213 | "query_filter= models.Filter(\n", 214 | "### YOUR CODE HERE\n", 215 | " )" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "### Search similar rows\n", 223 | "\n", 224 | "We can now take the embedding encoding of the user's query and use it to find similar rows in the vector database." 
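Before running the search cell that follows, the earlier fill-in cells need an embedding encoder, the upload of the movie points (skipping rows without an `overview`), and the 1990s filter. Below is a minimal, hedged sketch of those steps, not a reference answer: the column names (`id`, `overview`, `release_date`) come from the exercise text, the payload is simply the whole row, and the datetime filter follows the Qdrant documentation linked above (it assumes `release_date` holds ISO-formatted dates and a qdrant-client version that provides `models.DatetimeRange`).

```python
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer

# Embedding encoder suggested by the exercise.
encoder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

qdrant = QdrantClient(":memory:")
collection_name = "movies"
qdrant.recreate_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=encoder.get_sentence_embedding_dimension(),
        distance=models.Distance.COSINE,
    ),
)

# Upload one point per movie, skipping rows with a missing `overview`.
# Column names (`id`, `overview`) are taken from the exercise text.
qdrant.upload_points(
    collection_name=collection_name,
    points=[
        models.PointStruct(
            id=row["id"],
            vector=encoder.encode(row["overview"]).tolist(),
            payload=row,
        )
        for row in dataset
        if row.get("overview")
    ],
)

# Restrict retrieval to movies released in the 1990s, following the
# datetime-range filter documented by Qdrant. Assumes `release_date`
# is an ISO/RFC 3339 date string in the payload.
query_filter = models.Filter(
    must=[
        models.FieldCondition(
            key="release_date",
            range=models.DatetimeRange(
                gte="1990-01-01T00:00:00Z",
                lt="2000-01-01T00:00:00Z",
            ),
        )
    ]
)
```

Storing the whole row as the payload keeps every column available for the augmentation step; a leaner alternative is to store only the columns you plan to show or filter on.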
225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 54, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "# Search time for awesome wines!\n", 234 | "\n", 235 | "hits = qdrant.search(\n", 236 | " collection_name=collection_name,\n", 237 | " query_vector=query_vector,\n", 238 | " limit=1,\n", 239 | " query_filter=query_filter,\n", 240 | ")" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "from rich.text import Text\n", 250 | "from rich.table import Table\n", 251 | "\n", 252 | "table = Table(title=\"Retrieval Results\", show_lines=True)\n", 253 | "\n", 254 | "table.add_column(\"ID\", style=\"#e0e0e0\")\n", 255 | "table.add_column(\"Original Title\", style=\"#e0e0e0\")\n", 256 | "table.add_column(\"Overview\", style=\"bright_red\")\n", 257 | "table.add_column(\"Score\", style=\"#89ddff\")\n", 258 | "\n", 259 | "for hit in hits:\n", 260 | " table.add_row(\n", 261 | " str(hit.payload[\"id\"]),\n", 262 | " hit.payload[\"original_title\"],\n", 263 | " f'{hit.payload[\"overview\"]}',\n", 264 | " f\"{hit.score:.4f}\"\n", 265 | " )\n", 266 | "\n", 267 | "console.print(table)" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "## **A**ugment the prompt to the LLM with retrieved data\n", 275 | "\n", 276 | "In our simple example, we will simply take the top result and use it in the prompt to the generation LLM. We will filter some of the columns and keep only the following:\n", 277 | "* `original_title`\n", 278 | "* `title`\n", 279 | "* `overview`\n", 280 | "* `release_date`\n", 281 | "* `popularity`" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 56, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "# define a variable to hold the search results with specific fields\n", 291 | "search_results = [\n", 292 | " {\n", 293 | "### YOUR CODE HERE\n", 294 | " } for hit in hits]" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "console.print(search_results)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "## **G**enerate reply to the user's query\n", 311 | "\n", 312 | "We will use GPT-4 from [OpenAI](https://platform.openai.com/docs/models). Please write the prompt to instruct the LLM to write the recommendations based on the search results." 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 58, 318 | "metadata": {}, 319 | "outputs": [ 320 | { 321 | "data": { 322 | "text/html": [ 323 | "
╭────────────────────────────────────── Movie Recommendation with Retrieval ──────────────────────────────────────╮\n",
324 |        "                                                                                                                 \n",
325 |        " The movie you're looking for is 'Anna and the King' (1999). The film depicts the unique relationship between    \n",
326 |        " the King of Siam (now Thailand) and a widowed British school teacher Anna Leonowens during the 1860's. Anna     \n",
327 |        " teaches the king's many children and slowly develops a romantic yet controversial relationship with the king    \n",
328 |        " himself. This beautifully crafted story takes you through a rollercoaster of emotions, intrigue, and discovery  \n",
329 |        " of new cultures. You're sure to enjoy it.                                                                       \n",
330 |        "                                                                                                                 \n",
331 |        "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
332 |        "
\n" 333 | ], 334 | "text/plain": [ 335 | "\u001b[93m╭─\u001b[0m\u001b[93m─────────────────────────────────────\u001b[0m\u001b[93m Movie Recommendation with Retrieval \u001b[0m\u001b[93m─────────────────────────────────────\u001b[0m\u001b[93m─╮\u001b[0m\n", 336 | "\u001b[93m│\u001b[0m \u001b[93m│\u001b[0m\n", 337 | "\u001b[93m│\u001b[0m The movie you're looking for is 'Anna and the King' (1999). The film depicts the unique relationship between \u001b[93m│\u001b[0m\n", 338 | "\u001b[93m│\u001b[0m the King of Siam (now Thailand) and a widowed British school teacher Anna Leonowens during the 1860's. Anna \u001b[93m│\u001b[0m\n", 339 | "\u001b[93m│\u001b[0m teaches the king's many children and slowly develops a romantic yet controversial relationship with the king \u001b[93m│\u001b[0m\n", 340 | "\u001b[93m│\u001b[0m himself. This beautifully crafted story takes you through a rollercoaster of emotions, intrigue, and discovery \u001b[93m│\u001b[0m\n", 341 | "\u001b[93m│\u001b[0m of new cultures. You're sure to enjoy it. \u001b[93m│\u001b[0m\n", 342 | "\u001b[93m│\u001b[0m \u001b[93m│\u001b[0m\n", 343 | "\u001b[93m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" 344 | ] 345 | }, 346 | "metadata": {}, 347 | "output_type": "display_data" 348 | } 349 | ], 350 | "source": [ 351 | "from openai import OpenAI\n", 352 | "from rich.panel import Panel\n", 353 | "\n", 354 | "client = OpenAI()\n", 355 | "completion = client.chat.completions.create(\n", 356 | " model=\"gpt-4\",\n", 357 | " messages=[\n", 358 | "### YOUR CODE HERE ###\n", 359 | " ]\n", 360 | ")\n", 361 | "\n", 362 | "response_text = Text(completion.choices[0].message.content)\n", 363 | "styled_panel = Panel(\n", 364 | " response_text,\n", 365 | " title=\"Movie Recommendation with Retrieval\",\n", 366 | " expand=False,\n", 367 | " border_style=\"bright_yellow\",\n", 368 | " padding=(1, 1)\n", 369 | ")\n", 370 | "\n", 371 | "console.print(styled_panel)" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [] 380 | } 381 | ], 382 | "metadata": { 383 | "kernelspec": { 384 | "display_name": ".venv", 385 | "language": "python", 386 | "name": "python3" 387 | }, 388 | "language_info": { 389 | "codemirror_mode": { 390 | "name": "ipython", 391 | "version": 3 392 | }, 393 | "file_extension": ".py", 394 | "mimetype": "text/x-python", 395 | "name": "python", 396 | "nbconvert_exporter": "python", 397 | "pygments_lexer": "ipython3", 398 | "version": "3.11.0" 399 | } 400 | }, 401 | "nbformat": 4, 402 | "nbformat_minor": 2 403 | } 404 | -------------------------------------------------------------------------------- /exercise/02_advanced_chunking_exercise.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Advanced Chunking Exercise\n", 8 | "\n", 9 | "You will implement a RAG application for long and messy legal documents. You will implement the best practices you learned so far, including semantic chunking, and chunk enrichment. Then, you will implement semantic search and response generation with citation to the original documents." 
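Returning to the simple RAG exercise above for a moment: its last two fill-ins are the augmentation (keeping only the listed columns) and the generation call. Here is a hedged sketch of both, mirroring the system/user/assistant message pattern used in the generation cell of the advanced chunking exercise below; the system prompt wording is purely illustrative, not a reference answer.

```python
# Keep only the columns listed in the exercise for the augmentation step.
search_results = [
    {
        "original_title": hit.payload["original_title"],
        "title": hit.payload["title"],
        "overview": hit.payload["overview"],
        "release_date": hit.payload["release_date"],
        "popularity": hit.payload["popularity"],
    }
    for hit in hits
]

from openai import OpenAI

client = OpenAI()
completion = client.chat.completions.create(
    model="gpt-4",
    messages=[
        # Illustrative prompt only; write your own instruction for the exercise.
        {
            "role": "system",
            "content": "You are a movie expert. Recommend a movie to the user "
                       "based only on the retrieved search results.",
        },
        {"role": "user", "content": user_prompt},
        {"role": "assistant", "content": str(search_results)},
    ],
)
print(completion.choices[0].message.content)
```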
10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "### Visual improvements\n", 17 | "\n", 18 | "We will use [rich library](https://github.com/Textualize/rich) to make the output more readable, and supress warning messages." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "from rich.console import Console\n", 28 | "from rich_theme_manager import Theme, ThemeManager\n", 29 | "import pathlib\n", 30 | "\n", 31 | "theme_dir = pathlib.Path(\"../themes\")\n", 32 | "theme_manager = ThemeManager(theme_dir=theme_dir)\n", 33 | "dark = theme_manager.get(\"dark\")\n", 34 | "\n", 35 | "# Create a console with the dark theme\n", 36 | "console = Console(theme=dark)" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "import warnings\n", 46 | "\n", 47 | "# Suppress warnings\n", 48 | "warnings.filterwarnings('ignore')" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "## Loading complex PDF documents\n", 56 | "\n", 57 | "You will load a complex legal PDF document from the [case.law](https://case.law/) website. This website has millions of legal documents, and we will load a random PDF file from that site with more than 1,000 pages. \n", 58 | "\n", 59 | "To parse the PDF file you will use a PDF processor library, [pymupdf4llm](https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/), which makes it easy to extract text and other media from PDF files for RAG applications. " 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "import pymupdf4llm\n", 69 | "\n", 70 | "import requests\n", 71 | "import os\n", 72 | "\n", 73 | "random_doc_number = 196\n", 74 | "url = f\"https://static.case.law/wash-app/{random_doc_number}.pdf\"\n", 75 | "response = requests.get(url)\n", 76 | "\n", 77 | "data_folder = \"data\"\n", 78 | "if not os.path.exists(data_folder):\n", 79 | " os.makedirs(data_folder)\n", 80 | "\n", 81 | "with open(os.path.join(data_folder, f\"{random_doc_number}.pdf\"), \"wb\") as file:\n", 82 | " file.write(response.content)\n", 83 | "\n", 84 | "md_text = pymupdf4llm.to_markdown(f\"data/{random_doc_number}.pdf\", page_chunks=True)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "### Show a ramdom page from the document\n", 92 | "\n", 93 | "Let's check a random page from the PDF document and print its image and the extracted text." 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 5, 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "data": { 103 | "text/html": [ 104 | "\n", 105 | "
\n", 106 | "
\n", 107 | " \n", 108 | "
\n", 109 | "
\n", 110 | " \n", 111 | "

Extracted Text

\n", 112 | "

trial court that the jury authorized only this latter crime as\n", 113 | "\n", 114 | "a basis for sentencing.21\n", 115 | "\n", 116 | " - 4 The State argues that Morales’s reliance on Wil-\n", 117 | "\n", 118 | "liams-Walker is misplaced because CrR 7.8 was not dis-\n", 119 | "\n", 120 | "cussed in that case. That is the court rule on which the trial\n", 121 | "\n", 122 | "court relied to correct the jury verdict. This argument is not\n", 123 | "\n", 124 | "convincing.\n", 125 | "\n", 126 | " - 5 First, the fact that the supreme court did not discuss\n", 127 | "\n", 128 | "CrR 7.8 in Williams-Walker is not a persuasive distinguish-\n", 129 | "\n", 130 | "ing factor. There, the court held that the jury trial right\n", 131 | "\n", 132 | "included the right to be sentenced only on a basis autho-\n", 133 | "\n", 134 | "rized by a jury’s verdict. It did so on facts that are not\n", 135 | "\n", 136 | "materially distinguishable from those here.\n", 137 | "\n", 138 | " - 6 There, the jury verdict forms stated the “deadly\n", 139 | "\n", 140 | "weapon” enhancement, not the more serious “firearm” en-\n", 141 | "\n", 142 | "hancement. Trial courts in some of the cases sentenced on\n", 143 | "\n", 144 | "the basis of the latter, not the former, enhancement. The\n", 145 | "\n", 146 | "court held that was error.\n", 147 | "\n", 148 | " - 7 Here, the jury verdict stated Child Molestation in\n", 149 | "\n", 150 | "the Second Degree, not the more serious Child Molestation\n", 151 | "\n", 152 | "in the First Degree. The trial court sentenced on the basis of\n", 153 | "\n", 154 | "the more serious crime, not the one in the jury verdict.\n", 155 | "\n", 156 | " - 8 The underlying principle is the same: the jury ver-\n", 157 | "\n", 158 | "dict authorized only a sentence based on that verdict. The\n", 159 | "\n", 160 | "court based the sentence on a crime not authorized by the\n", 161 | "\n", 162 | "jury verdict.\n", 163 | "\n", 164 | " - 9 Second, we deal later in this opinion with the ques-\n", 165 | "\n", 166 | "tion whether CrR 7.8 is a proper remedy to correct an\n", 167 | "\n", 168 | "arguably erroneous jury verdict. That discussion deals more\n", 169 | "\n", 170 | "fully with the State’s argument.\n", 171 | "\n", 172 | " - 0 Given that there was an arguably erroneous jury\n", 173 | "\n", 174 | "verdict, we must decide whether the court had the authority\n", 175 | "\n", 176 | "to change it. Two supreme court cases provide guidance.\n", 177 | "\n", 178 | "\n", 179 | "-----\n", 180 | "\n", 181 | "

\n", 182 | "\n", 183 | "
\n", 184 | "
\n" 185 | ], 186 | "text/plain": [ 187 | "" 188 | ] 189 | }, 190 | "metadata": {}, 191 | "output_type": "display_data" 192 | } 193 | ], 194 | "source": [ 195 | "import fitz\n", 196 | "from IPython.display import display, HTML\n", 197 | "\n", 198 | "random_page_number = 149\n", 199 | "## Convert the PDF to an PNG image\n", 200 | "pdf_path = \"data/196.pdf\"\n", 201 | "pdf_document = fitz.open(pdf_path)\n", 202 | "page = pdf_document.load_page(random_page_number) # Page numbering starts from 0\n", 203 | "pix = page.get_pixmap()\n", 204 | "pix.save(\"random_page.png\")\n", 205 | "pdf_document.close()\n", 206 | "\n", 207 | "# Text content\n", 208 | "text_content = f\"\"\"\n", 209 | "

Extracted Text

\n", 210 | "

{md_text[random_page_number][\"text\"]}

\n", 211 | "\"\"\"\n", 212 | "\n", 213 | "# HTML layout for two columns to show the image and text side by side\n", 214 | "html_content = f\"\"\"\n", 215 | "
\n", 216 | "
\n", 217 | " \n", 218 | "
\n", 219 | "
\n", 220 | " {text_content}\n", 221 | "
\n", 222 | "
\n", 223 | "\"\"\"\n", 224 | "\n", 225 | "# Display in Jupyter notebook\n", 226 | "display(HTML(html_content))" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "You can see that the PDF processor extracts additional information on the document such as title, page count, etc. We can use this metadata for the metadata of our chunks in the vector database." 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "console.print(md_text[random_page_number])" 243 | ] 244 | }, 245 | { 246 | "cell_type": "markdown", 247 | "metadata": {}, 248 | "source": [ 249 | "## Split the documents into Chunks\n", 250 | "\n", 251 | "You will use the statistical chunker that we used in the hands-on lab. However, we want an encoder that is trained on legal document and can generate better embedding vectors to improve the retrieval results. For this exercise you will an encoder from Hugging Face hub: https://huggingface.co/nlpaueb/legal-bert-base-uncased." 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "from semantic_router.encoders import HuggingFaceEncoder\n", 261 | "\n", 262 | "encoder = HuggingFaceEncoder(\n", 263 | " ### YOUR CODE HERE ###\n", 264 | ")\n", 265 | "console.print(encoder)\n" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 7, 271 | "metadata": {}, 272 | "outputs": [ 273 | { 274 | "data": { 275 | "text/html": [ 276 | "
StatisticalChunker(\n",
277 |        "    name='statistical_chunker',\n",
278 |        "    encoder=HuggingFaceEncoder(\n",
279 |        "        name='nlpaueb/legal-bert-base-uncased',\n",
280 |        "        score_threshold=0.5,\n",
281 |        "        type='huggingface',\n",
282 |        "        tokenizer_kwargs={},\n",
283 |        "        model_kwargs={},\n",
284 |        "        device='cpu'\n",
285 |        "    ),\n",
286 |        "    threshold_adjustment=0.01,\n",
287 |        "    dynamic_threshold=True,\n",
288 |        "    window_size=5,\n",
289 |        "    plot_chunks=False,\n",
290 |        "    min_split_tokens=100,\n",
291 |        "    max_split_tokens=500,\n",
292 |        "    split_tokens_tolerance=10,\n",
293 |        "    enable_statistics=False\n",
294 |        ")\n",
295 |        "
\n" 296 | ], 297 | "text/plain": [ 298 | "\u001b[93mStatisticalChunker\u001b[0m\u001b[1m(\u001b[0m\n", 299 | " \u001b[1;38;2;232;125;62mname\u001b[0m=\u001b[92m'statistical_chunker'\u001b[0m,\n", 300 | " \u001b[1;38;2;232;125;62mencoder\u001b[0m=\u001b[93mHuggingFaceEncoder\u001b[0m\u001b[1m(\u001b[0m\n", 301 | " \u001b[1;38;2;232;125;62mname\u001b[0m=\u001b[92m'nlpaueb/legal-bert-base-uncased'\u001b[0m,\n", 302 | " \u001b[1;38;2;232;125;62mscore_threshold\u001b[0m=\u001b[91m0\u001b[0m\u001b[91m.5\u001b[0m,\n", 303 | " \u001b[1;38;2;232;125;62mtype\u001b[0m=\u001b[92m'huggingface'\u001b[0m,\n", 304 | " \u001b[1;38;2;232;125;62mtokenizer_kwargs\u001b[0m=\u001b[1m{\u001b[0m\u001b[1m}\u001b[0m,\n", 305 | " \u001b[1;38;2;232;125;62mmodel_kwargs\u001b[0m=\u001b[1m{\u001b[0m\u001b[1m}\u001b[0m,\n", 306 | " \u001b[1;38;2;232;125;62mdevice\u001b[0m=\u001b[92m'cpu'\u001b[0m\n", 307 | " \u001b[1m)\u001b[0m,\n", 308 | " \u001b[1;38;2;232;125;62mthreshold_adjustment\u001b[0m=\u001b[91m0\u001b[0m\u001b[91m.01\u001b[0m,\n", 309 | " \u001b[1;38;2;232;125;62mdynamic_threshold\u001b[0m=\u001b[3;92mTrue\u001b[0m,\n", 310 | " \u001b[1;38;2;232;125;62mwindow_size\u001b[0m=\u001b[91m5\u001b[0m,\n", 311 | " \u001b[1;38;2;232;125;62mplot_chunks\u001b[0m=\u001b[3;91mFalse\u001b[0m,\n", 312 | " \u001b[1;38;2;232;125;62mmin_split_tokens\u001b[0m=\u001b[91m100\u001b[0m,\n", 313 | " \u001b[1;38;2;232;125;62mmax_split_tokens\u001b[0m=\u001b[91m500\u001b[0m,\n", 314 | " \u001b[1;38;2;232;125;62msplit_tokens_tolerance\u001b[0m=\u001b[91m10\u001b[0m,\n", 315 | " \u001b[1;38;2;232;125;62menable_statistics\u001b[0m=\u001b[3;91mFalse\u001b[0m\n", 316 | "\u001b[1m)\u001b[0m\n" 317 | ] 318 | }, 319 | "metadata": {}, 320 | "output_type": "display_data" 321 | } 322 | ], 323 | "source": [ 324 | "from semantic_chunkers import StatisticalChunker\n", 325 | "import logging\n", 326 | "\n", 327 | "logging.disable(logging.CRITICAL)\n", 328 | "\n", 329 | "chunker = StatisticalChunker(\n", 330 | " encoder=encoder,\n", 331 | " min_split_tokens=100,\n", 332 | " max_split_tokens=500,\n", 333 | ")\n", 334 | "console.print(chunker)" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "### Chunking the full document text\n", 342 | "\n", 343 | "We will concatenate the text from all the pages of the document. We will insert the page number between the pages to allow the retrieval and then then the generation steps to create direct citation to the relevant page in the long document." 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 8, 349 | "metadata": {}, 350 | "outputs": [], 351 | "source": [ 352 | "concatenated_text = \" \".join([page[\"text\"] + f\"\" for i, page in enumerate(md_text)])\n", 353 | "\n", 354 | "chunks = ### YOUR CODE HERE ###\n" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": {}, 360 | "source": [ 361 | "How many chunks were created?" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": {}, 368 | "outputs": [], 369 | "source": [ 370 | "### YOUR CODE HERE ###" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": {}, 376 | "source": [ 377 | "Let's print a random chunk:" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 10, 383 | "metadata": {}, 384 | "outputs": [ 385 | { 386 | "data": { 387 | "text/html": [ 388 | "
Chunk(\n",
389 |        "    splits=[\n",
390 |        "        '“',\n",
391 |        "        'Porter explained, [I]t is often our professional legal opinion',\n",
392 |        "        'that a loan modification is in the best interests of our',\n",
393 |        "        'clients.',\n",
394 |        "        'In such cases, it would be unethical not to assist',\n",
395 |        "        'clients with these services.”',\n",
396 |        "        '- Porter states the “mailer received with the complaint',\n",
397 |        "        'is the only solicitation! ] used by PLC in WA.” Porter',\n",
398 |        "        'identified eight Washington residents PLC assisted with',\n",
399 |        "        'residential loan modifications and provided a “copy of a',\n",
400 |        "        '-----',\n",
401 |        "        '<page_break_41> solicitation to Washington consumers.” Porter admitted',\n",
402 |        "        'charging each of the eight Washington residents “$3997 [+]',\n",
403 |        "        'a monthly maintenance fee for loan modification services',\n",
404 |        "        'rendered.” Porter identified Christopher Jason Mercado as',\n",
405 |        "        'the Washington attorney who provided the residential loan',\n",
406 |        "        'modification services to Washington residents.',\n",
407 |        "        '- Porter claimed the attorney exemption to the Mort-',\n",
408 |        "        'gage Broker Practices Act (MBPA), chapter 19.146 RCW,',\n",
409 |        "        'applied and filed a “Claim of Non-Applicability of the',\n",
410 |        "        'Mortgage Broker Practices Act.” Porter asserted PLC was',\n",
411 |        "        'not subject to the MBPA because “Porter Law Center assists',\n",
412 |        "        'clients with application for loan modification as part of the',\n",
413 |        "        'licensed practice of law in the State of Washington.” DFI',\n",
414 |        "        'investigators attempted to contact the eight Washington',\n",
415 |        "        'residents Porter identified and “ultimately spoke to two of'\n",
416 |        "    ],\n",
417 |        "    is_triggered=True,\n",
418 |        "    triggered_score=0.7121353335887549,\n",
419 |        "    token_count=253,\n",
420 |        "    metadata=None\n",
421 |        ")\n",
422 |        "
\n" 423 | ], 424 | "text/plain": [ 425 | "\u001b[93mChunk\u001b[0m\u001b[1m(\u001b[0m\n", 426 | " \u001b[1;38;2;232;125;62msplits\u001b[0m=\u001b[1m[\u001b[0m\n", 427 | " \u001b[92m'“'\u001b[0m,\n", 428 | " \u001b[92m'Porter explained, \u001b[0m\u001b[1;92m[\u001b[0m\u001b[92mI\u001b[0m\u001b[1;92m]\u001b[0m\u001b[92mt is often our professional legal opinion'\u001b[0m,\n", 429 | " \u001b[92m'that a loan modification is in the best interests of our'\u001b[0m,\n", 430 | " \u001b[92m'clients.'\u001b[0m,\n", 431 | " \u001b[92m'In such cases, it would be unethical not to assist'\u001b[0m,\n", 432 | " \u001b[92m'clients with these services.”'\u001b[0m,\n", 433 | " \u001b[92m'- Porter states the “mailer received with the complaint'\u001b[0m,\n", 434 | " \u001b[92m'is the only solicitation! \u001b[0m\u001b[1;92m]\u001b[0m\u001b[92m used by PLC in WA.” Porter'\u001b[0m,\n", 435 | " \u001b[92m'identified eight Washington residents PLC assisted with'\u001b[0m,\n", 436 | " \u001b[92m'residential loan modifications and provided a “copy of a'\u001b[0m,\n", 437 | " \u001b[92m'-----'\u001b[0m,\n", 438 | " \u001b[92m'\u001b[0m\u001b[1;92m<\u001b[0m\u001b[2;92mpage_break_41\u001b[0m\u001b[1;92m>\u001b[0m\u001b[92m solicitation to Washington consumers.” Porter admitted'\u001b[0m,\n", 439 | " \u001b[92m'charging each of the eight Washington residents “$3997 \u001b[0m\u001b[1;92m[\u001b[0m\u001b[92m+\u001b[0m\u001b[1;92m]\u001b[0m\u001b[92m'\u001b[0m,\n", 440 | " \u001b[92m'a monthly maintenance fee for loan modification services'\u001b[0m,\n", 441 | " \u001b[92m'rendered.” Porter identified Christopher Jason Mercado as'\u001b[0m,\n", 442 | " \u001b[92m'the Washington attorney who provided the residential loan'\u001b[0m,\n", 443 | " \u001b[92m'modification services to Washington residents.'\u001b[0m,\n", 444 | " \u001b[92m'- Porter claimed the attorney exemption to the Mort-'\u001b[0m,\n", 445 | " \u001b[92m'gage Broker Practices Act \u001b[0m\u001b[1;92m(\u001b[0m\u001b[92mMBPA\u001b[0m\u001b[1;92m)\u001b[0m\u001b[92m, chapter 19.146 RCW,'\u001b[0m,\n", 446 | " \u001b[92m'applied and filed a “Claim of Non-Applicability of the'\u001b[0m,\n", 447 | " \u001b[92m'Mortgage Broker Practices Act.” Porter asserted PLC was'\u001b[0m,\n", 448 | " \u001b[92m'not subject to the MBPA because “Porter Law Center assists'\u001b[0m,\n", 449 | " \u001b[92m'clients with application for loan modification as part of the'\u001b[0m,\n", 450 | " \u001b[92m'licensed practice of law in the State of Washington.” DFI'\u001b[0m,\n", 451 | " \u001b[92m'investigators attempted to contact the eight Washington'\u001b[0m,\n", 452 | " \u001b[92m'residents Porter identified and “ultimately spoke to two of'\u001b[0m\n", 453 | " \u001b[1m]\u001b[0m,\n", 454 | " \u001b[1;38;2;232;125;62mis_triggered\u001b[0m=\u001b[3;92mTrue\u001b[0m,\n", 455 | " \u001b[1;38;2;232;125;62mtriggered_score\u001b[0m=\u001b[91m0\u001b[0m\u001b[91m.7121353335887549\u001b[0m,\n", 456 | " \u001b[1;38;2;232;125;62mtoken_count\u001b[0m=\u001b[91m253\u001b[0m,\n", 457 | " \u001b[1;38;2;232;125;62mmetadata\u001b[0m=\u001b[2;37mNone\u001b[0m\n", 458 | "\u001b[1m)\u001b[0m\n" 459 | ] 460 | }, 461 | "metadata": {}, 462 | "output_type": "display_data" 463 | } 464 | ], 465 | "source": [ 466 | "console.print(chunks[0][5])" 467 | ] 468 | }, 469 | { 470 | "cell_type": "markdown", 471 | "metadata": {}, 472 | "source": [ 473 | "What is the average numebr of tokens in the chunks?" 
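The fill-in cells above ask for the legal-domain encoder, the chunking call, and the two chunk statistics. A minimal sketch follows, assuming the `semantic_chunkers` calling convention implied by the later `chunks[0]` indexing (the chunker is called with a list of documents and returns one list of chunks per document):

```python
from semantic_router.encoders import HuggingFaceEncoder
from semantic_chunkers import StatisticalChunker

# Encoder trained on legal text, as suggested by the exercise; the `name`
# argument matches the encoder shown in the chunker printout above.
encoder = HuggingFaceEncoder(name="nlpaueb/legal-bert-base-uncased")

chunker = StatisticalChunker(
    encoder=encoder,
    min_split_tokens=100,
    max_split_tokens=500,
)

# Chunk the concatenated document text; one list of chunks per input document.
chunks = chunker(docs=[concatenated_text])

# Number of chunks and average token count (each Chunk exposes `token_count`,
# as shown in the printed chunk above).
num_chunks = len(chunks[0])
avg_tokens = sum(chunk.token_count for chunk in chunks[0]) / num_chunks
console.print(f"{num_chunks} chunks, {avg_tokens:.1f} tokens on average")
```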
474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": null, 479 | "metadata": {}, 480 | "outputs": [], 481 | "source": [ 482 | "### YOUR CODE HERE ###" 483 | ] 484 | }, 485 | { 486 | "cell_type": "markdown", 487 | "metadata": {}, 488 | "source": [ 489 | "## Enrich the chunk with context and metadata\n", 490 | "\n", 491 | "We will iterate over all the chunks. This can take some time based on the number of chunks.\n", 492 | "\n", 493 | "Since we want to be able to process a large number documents in our RAG system, we need to create a UUID that will used as the ID of the chunk within the vector database. The UUID is comprised of the URL of the document and the chunk index. This structure allows you to get a specific chunk index directly, whick will be improtant in the augmentation phase." 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 22, 499 | "metadata": {}, 500 | "outputs": [ 501 | { 502 | "name": "stderr", 503 | "output_type": "stream", 504 | "text": [ 505 | "Processing chunks: 100%|██████████| 2336/2336 [03:42<00:00, 10.51it/s]\n" 506 | ] 507 | } 508 | ], 509 | "source": [ 510 | "import uuid\n", 511 | "import re\n", 512 | "\n", 513 | "doc_url = url\n", 514 | "title = md_text[0][\"metadata\"][\"title\"]\n", 515 | "# Enrich the metadata with filters that are relevant for future retrieval queries.\n", 516 | "state = \"Washington\"\n", 517 | "\n", 518 | "from tqdm import tqdm\n", 519 | "\n", 520 | "def generate_uuid(doc_url, i):\n", 521 | " return str(uuid.uuid5(uuid.NAMESPACE_URL, f\"{doc_url}/{i}\"))\n", 522 | "\n", 523 | "corpus_json = []\n", 524 | "for i, chunk in tqdm(enumerate(chunks[0]), total=len(chunks[0]), desc=\"Processing chunks\"):\n", 525 | " chunk_text = ' '.join(chunk.splits)\n", 526 | " # Extract the page number from the page breaks\n", 527 | " page_match = re.search(r'', chunk_text)\n", 528 | " page = page_match.group(1) if page_match else 0\n", 529 | " chunk_uuid = generate_uuid(doc_url, i)\n", 530 | " corpus_json.append({\n", 531 | " \"id\": chunk_uuid,\n", 532 | " \"document\": chunk_text,\n", 533 | " # Add the title of the document to the chunk text for embedding\n", 534 | " \"embedding\": encoder([f\"{title} \\n {chunk_text}\"])[0],\n", 535 | " \"metadata\" : {\n", 536 | " \"title\": title,\n", 537 | " \"state\": state,\n", 538 | " \"doc_url\": doc_url,\n", 539 | " \"chunk_index\": i,\n", 540 | " \"page\": page,\n", 541 | " }\n", 542 | " })\n" 543 | ] 544 | }, 545 | { 546 | "cell_type": "markdown", 547 | "metadata": {}, 548 | "source": [ 549 | "## Loading into a Vector Database\n", 550 | "\n", 551 | "You will use a new vector data, [Chroma](https://github.com/chroma-core/chroma). It can illustrate the modularity of the RAG application, and the similar concepts across the providers." 552 | ] 553 | }, 554 | { 555 | "cell_type": "markdown", 556 | "metadata": {}, 557 | "source": [ 558 | "### Creating the collection \n", 559 | "\n", 560 | "You will use the default values for this simpler exercise." 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 14, 566 | "metadata": {}, 567 | "outputs": [], 568 | "source": [ 569 | "import chromadb\n", 570 | "# setup Chroma in-memory, for easy prototyping. Can add persistence easily!\n", 571 | "client = chromadb.Client()\n", 572 | "\n", 573 | "# Create collection. 
get_collection, create_collection, delete_collection also available!\n", 574 | "collection_name = \"legal-pdfs\"\n", 575 | "collection = client.get_or_create_collection(collection_name)\n" 576 | ] 577 | }, 578 | { 579 | "cell_type": "markdown", 580 | "metadata": {}, 581 | "source": [ 582 | "### Unserting the documents\n", 583 | "\n", 584 | "We will use the embedding, metadata and documents that were calculated above." 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": 15, 590 | "metadata": {}, 591 | "outputs": [], 592 | "source": [ 593 | "collection.add(\n", 594 | " documents=### YOUR CODE HERE ###,\n", 595 | " embeddings=[obj[\"embedding\"] for obj in corpus_json],\n", 596 | " metadatas=### YOUR CODE HERE ###,\n", 597 | " ### YOUR CODE HERE ###\n", 598 | ")" 599 | ] 600 | }, 601 | { 602 | "cell_type": "markdown", 603 | "metadata": {}, 604 | "source": [ 605 | "### Query the vector collection\n", 606 | "\n", 607 | "We will add an example filter to the query based on the metadata that we created for each chunk (`{\"state\": \"Washington\"}`)." 608 | ] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "execution_count": 16, 613 | "metadata": {}, 614 | "outputs": [], 615 | "source": [ 616 | "query_text = \"cases about loan default\"\n", 617 | "query_embedding = ### YOUR CODE HERE ###" 618 | ] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": 17, 623 | "metadata": {}, 624 | "outputs": [], 625 | "source": [ 626 | "hits = collection.query(\n", 627 | " query_embeddings=query_embedding,\n", 628 | " n_results=5,\n", 629 | " where={\"state\": \"Washington\"},\n", 630 | ")\n" 631 | ] 632 | }, 633 | { 634 | "cell_type": "markdown", 635 | "metadata": {}, 636 | "source": [ 637 | "## Augmentation Step\n", 638 | "\n", 639 | "We suspect that the chunk context is too small and we want to concatenate the chunks around it, before we send the text to the generation step. " 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": null, 645 | "metadata": {}, 646 | "outputs": [], 647 | "source": [ 648 | "console.print(hits)" 649 | ] 650 | }, 651 | { 652 | "cell_type": "markdown", 653 | "metadata": {}, 654 | "source": [ 655 | "### Augmenting the search result\n", 656 | "\n", 657 | "You will iterate over all the search results and prepare them to the generation step. The main augmentation is the concatenation of the sourounding chunks text." 
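Below is a consolidated, hedged sketch of the remaining fill-ins in this part: adding the enriched chunks to the Chroma collection, embedding the query with the same legal-domain encoder, and concatenating each hit with its neighbouring chunks. The `get_chunk_text` helper is introduced here only for illustration; it guards the first/last-chunk edge case that the exercise cells leave open.

```python
# Upsert the enriched chunks: ids, documents, embeddings, and metadata all
# come from the `corpus_json` list built above.
collection.add(
    ids=[obj["id"] for obj in corpus_json],
    documents=[obj["document"] for obj in corpus_json],
    embeddings=[obj["embedding"] for obj in corpus_json],
    metadatas=[obj["metadata"] for obj in corpus_json],
)

# Embed the query with the same encoder used for the chunks.
query_text = "cases about loan default"
query_embedding = encoder([query_text])

hits = collection.query(
    query_embeddings=query_embedding,
    n_results=5,
    where={"state": "Washington"},
)

def get_chunk_text(chunk_id: str) -> str:
    # Illustrative helper: fetch a chunk by id, returning "" if it does not exist
    # (e.g. the neighbour of the first or last chunk).
    result = collection.get(ids=[chunk_id])
    return result["documents"][0] if result["documents"] else ""

# Augment each hit with its neighbouring chunks, reusing the UUID scheme
# from the enrichment step.
search_results = []
for document, metadata in zip(hits["documents"][0], hits["metadatas"][0]):
    doc_url = metadata["doc_url"]
    chunk_index = metadata["chunk_index"]
    previous_text = get_chunk_text(generate_uuid(doc_url, chunk_index - 1))
    next_text = get_chunk_text(generate_uuid(doc_url, chunk_index + 1))
    search_results.append({
        "document": f"{previous_text} {document} {next_text}".strip(),
        "metadata": metadata,
    })
```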
658 | ] 659 | }, 660 | { 661 | "cell_type": "code", 662 | "execution_count": 19, 663 | "metadata": {}, 664 | "outputs": [], 665 | "source": [ 666 | "# define a variable to hold the search results with specific fields\n", 667 | "search_results = []\n", 668 | "\n", 669 | "for document, metadata in zip(hits[\"documents\"][0], hits[\"metadatas\"][0]):\n", 670 | " doc_url = metadata[\"doc_url\"]\n", 671 | " chunk_index = metadata[\"chunk_index\"]\n", 672 | " doc_id = generate_uuid(doc_url, chunk_index)\n", 673 | " # Calculate the chunk IDs of the previous and next chunks\n", 674 | " previous_chunk_id = ### YOUR CODE HERE ###\n", 675 | " next_chunk_id = ### YOUR CODE HERE ###\n", 676 | " # Get the chunks from the vector collection with the chunk ids.\n", 677 | " previous_chunk = collection.get(### YOUR CODE HERE ###)\n", 678 | " next_chunk = ### YOUR CODE HERE ###\n", 679 | " search_results.append({\n", 680 | " # Concatenate the previous, current, and next document chunks to form a single document\n", 681 | " \"document\": f\"{previous_chunk['documents'][0]} {document} {next_chunk['documents'][0]}\",\n", 682 | " \"metadata\": metadata,\n", 683 | " })" 684 | ] 685 | }, 686 | { 687 | "cell_type": "markdown", 688 | "metadata": {}, 689 | "source": [ 690 | "Let's print the first search result, before sending it to the generation model:" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": null, 696 | "metadata": {}, 697 | "outputs": [], 698 | "source": [ 699 | "console.print(search_results[0])" 700 | ] 701 | }, 702 | { 703 | "cell_type": "code", 704 | "execution_count": 22, 705 | "metadata": {}, 706 | "outputs": [ 707 | { 708 | "data": { 709 | "text/html": [ 710 | "
╭────────────────────────────────────── Reply to 'cases about loan default' ──────────────────────────────────────╮\n",
711 |        "                                                                                                                 \n",
712 |        " Here are some cases that discuss issues of loan default:                                                        \n",
713 |        "                                                                                                                 \n",
714 |        " 1. v. Stump: The case examines a rule in a particular context, considering related provisions and the whole     \n",
715 |        " statutory or rule-making scheme. (Washington Appellate Reports volume 196, Page 684)                            \n",
716 |        " [Source](https://static.case.law/wash-app/196.pdf)                                                              \n",
717 |        "                                                                                                                 \n",
718 |        " 2. Hassan Farah et al., v. Hertz Transporting, Inc., et al.: The details of this case aren't provided in the    \n",
719 |        " given excerpt but it might be relevant for you to look up more details. (Washington Appellate Reports volume    \n",
720 |        " 196, Page 205) [Source](https://static.case.law/wash-app/196.pdf)                                               \n",
721 |        "                                                                                                                 \n",
722 |        " 3. Town of Tekoa v. Reilly: This case is discussing property tax exemptions, which may be pertinent if there's  \n",
723 |        " a connection to property involved in the loan default. (Washington Appellate Reports volume 196, Page 136)      \n",
724 |        " [Source](https://static.case.law/wash-app/196.pdf))                                                             \n",
725 |        "                                                                                                                 \n",
726 |        " These cases may give some insight into handling similar situations involving loan defaults. Please provide more \n",
727 |        " specific details if you need cases relating to a particular aspect of loan default.                             \n",
728 |        "                                                                                                                 \n",
729 |        "╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\n",
730 |        "
\n" 731 | ], 732 | "text/plain": [ 733 | "\u001b[93m╭─\u001b[0m\u001b[93m─────────────────────────────────────\u001b[0m\u001b[93m Reply to 'cases about loan default' \u001b[0m\u001b[93m─────────────────────────────────────\u001b[0m\u001b[93m─╮\u001b[0m\n", 734 | "\u001b[93m│\u001b[0m \u001b[93m│\u001b[0m\n", 735 | "\u001b[93m│\u001b[0m Here are some cases that discuss issues of loan default: \u001b[93m│\u001b[0m\n", 736 | "\u001b[93m│\u001b[0m \u001b[93m│\u001b[0m\n", 737 | "\u001b[93m│\u001b[0m 1. v. Stump: The case examines a rule in a particular context, considering related provisions and the whole \u001b[93m│\u001b[0m\n", 738 | "\u001b[93m│\u001b[0m statutory or rule-making scheme. (Washington Appellate Reports volume 196, Page 684) \u001b[93m│\u001b[0m\n", 739 | "\u001b[93m│\u001b[0m [Source](https://static.case.law/wash-app/196.pdf) \u001b[93m│\u001b[0m\n", 740 | "\u001b[93m│\u001b[0m \u001b[93m│\u001b[0m\n", 741 | "\u001b[93m│\u001b[0m 2. Hassan Farah et al., v. Hertz Transporting, Inc., et al.: The details of this case aren't provided in the \u001b[93m│\u001b[0m\n", 742 | "\u001b[93m│\u001b[0m given excerpt but it might be relevant for you to look up more details. (Washington Appellate Reports volume \u001b[93m│\u001b[0m\n", 743 | "\u001b[93m│\u001b[0m 196, Page 205) [Source](https://static.case.law/wash-app/196.pdf) \u001b[93m│\u001b[0m\n", 744 | "\u001b[93m│\u001b[0m \u001b[93m│\u001b[0m\n", 745 | "\u001b[93m│\u001b[0m 3. Town of Tekoa v. Reilly: This case is discussing property tax exemptions, which may be pertinent if there's \u001b[93m│\u001b[0m\n", 746 | "\u001b[93m│\u001b[0m a connection to property involved in the loan default. (Washington Appellate Reports volume 196, Page 136) \u001b[93m│\u001b[0m\n", 747 | "\u001b[93m│\u001b[0m [Source](https://static.case.law/wash-app/196.pdf)) \u001b[93m│\u001b[0m\n", 748 | "\u001b[93m│\u001b[0m \u001b[93m│\u001b[0m\n", 749 | "\u001b[93m│\u001b[0m These cases may give some insight into handling similar situations involving loan defaults. Please provide more \u001b[93m│\u001b[0m\n", 750 | "\u001b[93m│\u001b[0m specific details if you need cases relating to a particular aspect of loan default. \u001b[93m│\u001b[0m\n", 751 | "\u001b[93m│\u001b[0m \u001b[93m│\u001b[0m\n", 752 | "\u001b[93m╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯\u001b[0m\n" 753 | ] 754 | }, 755 | "metadata": {}, 756 | "output_type": "display_data" 757 | } 758 | ], 759 | "source": [ 760 | "from openai import OpenAI\n", 761 | "from rich.panel import Panel\n", 762 | "from rich.text import Text\n", 763 | "\n", 764 | "client = OpenAI()\n", 765 | "system_message = \"\"\"\n", 766 | "You are a paralegal specialist. 
\n", 767 | "### YOUR CODE HERE ###\n", 768 | "### YOUR CODE HERE ###\n", 769 | "\"\"\"\n", 770 | "completion = client.chat.completions.create(\n", 771 | " model=\"gpt-4\",\n", 772 | " messages=[\n", 773 | " {\"role\": \"system\", \"content\": system_message},\n", 774 | " {\"role\": \"user\", \"content\": query_text},\n", 775 | " {\"role\": \"assistant\", \"content\": str(search_results)}\n", 776 | " ]\n", 777 | ")\n", 778 | "\n", 779 | "response_text = Text(completion.choices[0].message.content)\n", 780 | "styled_panel = Panel(\n", 781 | " response_text,\n", 782 | " title=f\"Reply to '{query_text}'\",\n", 783 | " expand=False,\n", 784 | " border_style=\"bright_yellow\",\n", 785 | " padding=(1, 1)\n", 786 | ")\n", 787 | "\n", 788 | "console.print(styled_panel)" 789 | ] 790 | }, 791 | { 792 | "cell_type": "code", 793 | "execution_count": null, 794 | "metadata": {}, 795 | "outputs": [], 796 | "source": [] 797 | } 798 | ], 799 | "metadata": { 800 | "kernelspec": { 801 | "display_name": ".venv", 802 | "language": "python", 803 | "name": "python3" 804 | }, 805 | "language_info": { 806 | "codemirror_mode": { 807 | "name": "ipython", 808 | "version": 3 809 | }, 810 | "file_extension": ".py", 811 | "mimetype": "text/x-python", 812 | "name": "python", 813 | "nbconvert_exporter": "python", 814 | "pygments_lexer": "ipython3", 815 | "version": "3.11.0" 816 | } 817 | }, 818 | "nbformat": 4, 819 | "nbformat_minor": 2 820 | } 821 | -------------------------------------------------------------------------------- /exercise/random_page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guyernest/advanced-rag/63d9ec59daf82e60023d3a336988e32d6b6c1d4a/exercise/random_page.png -------------------------------------------------------------------------------- /images/Advanced_RAG.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guyernest/advanced-rag/63d9ec59daf82e60023d3a336988e32d6b6c1d4a/images/Advanced_RAG.png -------------------------------------------------------------------------------- /images/Hybrid_Search.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guyernest/advanced-rag/63d9ec59daf82e60023d3a336988e32d6b6c1d4a/images/Hybrid_Search.png -------------------------------------------------------------------------------- /images/Naiive_RAG.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guyernest/advanced-rag/63d9ec59daf82e60023d3a336988e32d6b6c1d4a/images/Naiive_RAG.png -------------------------------------------------------------------------------- /images/Recall_Precision_in_RAG_Diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guyernest/advanced-rag/63d9ec59daf82e60023d3a336988e32d6b6c1d4a/images/Recall_Precision_in_RAG_Diagram.png -------------------------------------------------------------------------------- /images/advanced-rag-setup.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guyernest/advanced-rag/63d9ec59daf82e60023d3a336988e32d6b6c1d4a/images/advanced-rag-setup.gif -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = 
"advanced-rag" 3 | version = "0.1.0" 4 | description = "Content for Mastering LLM with Advanced RAG online course." 5 | readme = "README.md" 6 | requires-python = ">=3.8" 7 | dependencies = [ 8 | "ruff>=0.6.7", 9 | ] 10 | -------------------------------------------------------------------------------- /requirements.in: -------------------------------------------------------------------------------- 1 | # ipykernel # Needed for the Jupyter notebooks used 2 | openai # Needed for the embedding and LLM calls 3 | anthropic # Needed for the chunking and LLM calls 4 | cohere # Needed for reranking exercise 5 | numpy # Needed for speeding up mathematical calculations 6 | scikit-learn # Needed for the cosine distance calculations 7 | python-dotenv # To handle secrets such as API keys 8 | # open-webui # To provide simple UI to interact with the LLM models 9 | qdrant-client # Local vector database 10 | einops # To load Transformer models 11 | sentence-transformers # sentence parsing and embedding 12 | # sentence-transformers-impl # For the community models 13 | semantic-router==0.0.37 # For semantic chunking 14 | semantic-router[processing] # For semantic chunking plots 15 | semantic-chunkers 16 | datasets==2.19.0 # Loading datasets for semantic chunking 17 | altair 18 | seaborn 19 | tabulate 20 | matplotlib 21 | vegafusion-python-embed>=1.5.0 # For large dataset (TNSE output) for Altair 22 | vegafusion>=1.5.0 # For large dataset (TNSE output) for Altair 23 | vl-convert-python>=1.6.0 # For large dataset (TNSE output) for Altair 24 | # nbformat 25 | # ipywidgets 26 | # For the multi-modal notebook 27 | colpali_engine>=0.3.1 28 | huggingface_hub[hf_transfer] 29 | transformers>=4.45.0 30 | stamina 31 | rich 32 | rich-theme-manager 33 | bm25s # For Hybrid Search 34 | PyStemmer # for BM25 Hybrid Search 35 | pdf2image # for multi modal demo 36 | pymupdf4llm # For PDF exercise -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file was autogenerated by uv via the following command: 2 | # uv pip compile requirements.in --output-file requirements.txt 3 | accelerate==1.0.0 4 | # via peft 5 | aiohappyeyeballs==2.4.2 6 | # via aiohttp 7 | aiohttp==3.10.5 8 | # via 9 | # cohere 10 | # datasets 11 | # fsspec 12 | aiosignal==1.3.1 13 | # via aiohttp 14 | altair==5.4.1 15 | # via 16 | # -r requirements.in 17 | # vegafusion 18 | annotated-types==0.7.0 19 | # via pydantic 20 | anthropic==0.39.0 21 | # via -r requirements.in 22 | anyio==4.6.0 23 | # via 24 | # anthropic 25 | # httpx 26 | # openai 27 | attrs==24.2.0 28 | # via 29 | # aiohttp 30 | # jsonschema 31 | # referencing 32 | backoff==2.2.1 33 | # via cohere 34 | bm25s==0.2.2 35 | # via -r requirements.in 36 | certifi==2024.8.30 37 | # via 38 | # httpcore 39 | # httpx 40 | # requests 41 | charset-normalizer==3.3.2 42 | # via requests 43 | cohere==4.57 44 | # via 45 | # -r requirements.in 46 | # semantic-router 47 | colorama==0.4.6 48 | # via 49 | # semantic-chunkers 50 | # semantic-router 51 | colorlog==6.8.2 52 | # via 53 | # semantic-chunkers 54 | # semantic-router 55 | colpali-engine==0.3.1 56 | # via -r requirements.in 57 | contourpy==1.3.0 58 | # via matplotlib 59 | cycler==0.12.1 60 | # via matplotlib 61 | datasets==2.19.0 62 | # via -r requirements.in 63 | dill==0.3.8 64 | # via 65 | # datasets 66 | # multiprocess 67 | distro==1.9.0 68 | # via 69 | # anthropic 70 | # openai 71 | einops==0.8.0 72 | # via -r 
requirements.in 73 | fastavro==1.9.7 74 | # via cohere 75 | filelock==3.16.1 76 | # via 77 | # datasets 78 | # huggingface-hub 79 | # torch 80 | # transformers 81 | fonttools==4.54.1 82 | # via matplotlib 83 | frozenlist==1.4.1 84 | # via 85 | # aiohttp 86 | # aiosignal 87 | fsspec==2024.3.1 88 | # via 89 | # datasets 90 | # huggingface-hub 91 | # torch 92 | gputil==1.4.0 93 | # via colpali-engine 94 | grpcio==1.66.1 95 | # via 96 | # grpcio-tools 97 | # qdrant-client 98 | grpcio-tools==1.62.3 99 | # via qdrant-client 100 | h11==0.14.0 101 | # via httpcore 102 | h2==4.1.0 103 | # via httpx 104 | hf-transfer==0.1.8 105 | # via huggingface-hub 106 | hpack==4.0.0 107 | # via h2 108 | httpcore==1.0.5 109 | # via httpx 110 | httpx==0.27.2 111 | # via 112 | # anthropic 113 | # openai 114 | # qdrant-client 115 | huggingface-hub==0.25.1 116 | # via 117 | # -r requirements.in 118 | # accelerate 119 | # datasets 120 | # peft 121 | # sentence-transformers 122 | # tokenizers 123 | # transformers 124 | hyperframe==6.0.1 125 | # via h2 126 | idna==3.10 127 | # via 128 | # anyio 129 | # httpx 130 | # requests 131 | # yarl 132 | importlib-metadata==6.11.0 133 | # via cohere 134 | jinja2==3.1.4 135 | # via 136 | # altair 137 | # torch 138 | jiter==0.5.0 139 | # via 140 | # anthropic 141 | # openai 142 | joblib==1.4.2 143 | # via scikit-learn 144 | jsonschema==4.23.0 145 | # via altair 146 | jsonschema-specifications==2023.12.1 147 | # via jsonschema 148 | kiwisolver==1.4.7 149 | # via matplotlib 150 | markdown-it-py==3.0.0 151 | # via rich 152 | markupsafe==2.1.5 153 | # via jinja2 154 | matplotlib==3.9.2 155 | # via 156 | # -r requirements.in 157 | # seaborn 158 | # semantic-router 159 | mdurl==0.1.2 160 | # via markdown-it-py 161 | mpmath==1.3.0 162 | # via sympy 163 | multidict==6.1.0 164 | # via 165 | # aiohttp 166 | # yarl 167 | multiprocess==0.70.16 168 | # via datasets 169 | narwhals==1.9.3 170 | # via altair 171 | networkx==3.3 172 | # via torch 173 | numpy==1.26.4 174 | # via 175 | # -r requirements.in 176 | # accelerate 177 | # bm25s 178 | # colpali-engine 179 | # contourpy 180 | # datasets 181 | # matplotlib 182 | # pandas 183 | # peft 184 | # pyarrow 185 | # qdrant-client 186 | # scikit-learn 187 | # scipy 188 | # seaborn 189 | # semantic-chunkers 190 | # semantic-router 191 | # sentence-transformers 192 | # transformers 193 | openai==1.48.0 194 | # via 195 | # -r requirements.in 196 | # semantic-router 197 | packaging==23.2 198 | # via 199 | # accelerate 200 | # altair 201 | # datasets 202 | # huggingface-hub 203 | # matplotlib 204 | # peft 205 | # transformers 206 | pandas==2.2.2 207 | # via 208 | # datasets 209 | # seaborn 210 | # vegafusion 211 | pdf2image==1.17.0 212 | # via -r requirements.in 213 | peft==0.11.1 214 | # via colpali-engine 215 | pillow==10.4.0 216 | # via 217 | # colpali-engine 218 | # matplotlib 219 | # pdf2image 220 | # sentence-transformers 221 | portalocker==2.10.1 222 | # via qdrant-client 223 | protobuf==4.25.5 224 | # via 225 | # grpcio-tools 226 | # vegafusion 227 | psutil==6.0.0 228 | # via 229 | # accelerate 230 | # peft 231 | # vegafusion 232 | pyarrow==17.0.0 233 | # via 234 | # datasets 235 | # vegafusion 236 | pyarrow-hotfix==0.6 237 | # via datasets 238 | pydantic==2.8.2 239 | # via 240 | # anthropic 241 | # openai 242 | # qdrant-client 243 | # semantic-chunkers 244 | # semantic-router 245 | pydantic-core==2.20.1 246 | # via pydantic 247 | pygments==2.18.0 248 | # via rich 249 | pymupdf==1.24.13 250 | # via pymupdf4llm 251 | pymupdf4llm==0.0.17 252 | # via -r 
requirements.in 253 | pyparsing==3.1.4 254 | # via matplotlib 255 | pystemmer==2.2.0.3 256 | # via -r requirements.in 257 | python-dateutil==2.9.0.post0 258 | # via 259 | # matplotlib 260 | # pandas 261 | python-dotenv==1.0.1 262 | # via -r requirements.in 263 | pytz==2024.2 264 | # via pandas 265 | pyyaml==6.0.2 266 | # via 267 | # accelerate 268 | # datasets 269 | # huggingface-hub 270 | # peft 271 | # semantic-router 272 | # transformers 273 | qdrant-client==1.11.3 274 | # via -r requirements.in 275 | referencing==0.35.1 276 | # via 277 | # jsonschema 278 | # jsonschema-specifications 279 | regex==2023.12.25 280 | # via 281 | # semantic-chunkers 282 | # semantic-router 283 | # tiktoken 284 | # transformers 285 | requests==2.32.3 286 | # via 287 | # cohere 288 | # colpali-engine 289 | # datasets 290 | # huggingface-hub 291 | # requests-mock 292 | # tiktoken 293 | # transformers 294 | requests-mock==1.12.1 295 | # via 296 | # semantic-chunkers 297 | # semantic-router 298 | rich==13.8.1 299 | # via 300 | # -r requirements.in 301 | # rich-theme-manager 302 | rich-theme-manager==0.11.0 303 | # via -r requirements.in 304 | rpds-py==0.20.0 305 | # via 306 | # jsonschema 307 | # referencing 308 | safetensors==0.4.5 309 | # via 310 | # accelerate 311 | # peft 312 | # transformers 313 | scikit-learn==1.5.2 314 | # via 315 | # -r requirements.in 316 | # sentence-transformers 317 | scipy==1.14.1 318 | # via 319 | # bm25s 320 | # scikit-learn 321 | # sentence-transformers 322 | seaborn==0.13.2 323 | # via -r requirements.in 324 | semantic-chunkers==0.0.4 325 | # via -r requirements.in 326 | semantic-router==0.0.37 327 | # via 328 | # -r requirements.in 329 | # semantic-chunkers 330 | sentence-transformers==3.0.1 331 | # via -r requirements.in 332 | setuptools==75.1.0 333 | # via grpcio-tools 334 | six==1.16.0 335 | # via python-dateutil 336 | sniffio==1.3.1 337 | # via 338 | # anthropic 339 | # anyio 340 | # httpx 341 | # openai 342 | stamina==24.3.0 343 | # via -r requirements.in 344 | sympy==1.13.3 345 | # via torch 346 | tabulate==0.9.0 347 | # via -r requirements.in 348 | tenacity==8.5.0 349 | # via stamina 350 | threadpoolctl==3.5.0 351 | # via scikit-learn 352 | tiktoken==0.6.0 353 | # via 354 | # semantic-chunkers 355 | # semantic-router 356 | tokenizers==0.20.0 357 | # via transformers 358 | torch==2.4.1 359 | # via 360 | # accelerate 361 | # colpali-engine 362 | # peft 363 | # sentence-transformers 364 | tqdm==4.66.5 365 | # via 366 | # datasets 367 | # huggingface-hub 368 | # openai 369 | # peft 370 | # sentence-transformers 371 | # transformers 372 | transformers==4.45.1 373 | # via 374 | # -r requirements.in 375 | # colpali-engine 376 | # peft 377 | # sentence-transformers 378 | typing-extensions==4.12.2 379 | # via 380 | # altair 381 | # anthropic 382 | # huggingface-hub 383 | # openai 384 | # pydantic 385 | # pydantic-core 386 | # torch 387 | tzdata==2024.2 388 | # via pandas 389 | urllib3==2.2.3 390 | # via 391 | # cohere 392 | # qdrant-client 393 | # requests 394 | vegafusion==1.6.9 395 | # via -r requirements.in 396 | vegafusion-python-embed==1.6.9 397 | # via -r requirements.in 398 | vl-convert-python==1.7.0 399 | # via -r requirements.in 400 | xxhash==3.5.0 401 | # via datasets 402 | yarl==1.13.0 403 | # via aiohttp 404 | zipp==3.20.2 405 | # via importlib-metadata 406 | -------------------------------------------------------------------------------- /themes/dark.theme: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = dark 
3 | description = Dark mode theme 4 | tags = dark 5 | inherit = True 6 | 7 | [styles] 8 | default = bright_white on black 9 | repr.attrib_name = bold #e87d3e 10 | repr.attrib_value = bright_blue 11 | repr.call = bright_yellow 12 | repr.none = dim white 13 | repr.number = bright_red 14 | repr.own = bold #e87d3e 15 | repr.str = bright_green 16 | repr.tag_name = dim cyan 17 | -------------------------------------------------------------------------------- /themes/light.theme: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = light 3 | description = Light mode theme 4 | tags = 5 | inherit = True 6 | 7 | [styles] 8 | default = #000000 on #ffffff 9 | repr.attrib_name = bold #ffff00 10 | repr.attrib_value = bold #008080 11 | repr.call = bold #ffff00 12 | repr.none = bold #808080 13 | repr.number = bold #ff6347 14 | repr.own = bold #22863a 15 | repr.str = bold #008080 16 | repr.tag_name = bold #00bfff 17 | -------------------------------------------------------------------------------- /uv.lock: -------------------------------------------------------------------------------- 1 | version = 1 2 | requires-python = ">=3.8" 3 | 4 | [[package]] 5 | name = "advanced-rag" 6 | version = "0.1.0" 7 | source = { virtual = "." } 8 | dependencies = [ 9 | { name = "ruff" }, 10 | ] 11 | 12 | [package.metadata] 13 | requires-dist = [{ name = "ruff", specifier = ">=0.6.7" }] 14 | 15 | [[package]] 16 | name = "ruff" 17 | version = "0.6.7" 18 | source = { registry = "https://pypi.org/simple" } 19 | sdist = { url = "https://files.pythonhosted.org/packages/8d/7c/3045a526c57cef4b5ec4d5d154692e31429749a49810a53e785de334c4f6/ruff-0.6.7.tar.gz", hash = "sha256:44e52129d82266fa59b587e2cd74def5637b730a69c4542525dfdecfaae38bd5", size = 3073785 } 20 | wheels = [ 21 | { url = "https://files.pythonhosted.org/packages/22/c4/1c5c636f83f905c537785016e9cdd7a36df53c025a2d07940580ecb37bcf/ruff-0.6.7-py3-none-linux_armv6l.whl", hash = "sha256:08277b217534bfdcc2e1377f7f933e1c7957453e8a79764d004e44c40db923f2", size = 10336748 }, 22 | { url = "https://files.pythonhosted.org/packages/84/d9/aa15a56be7ad796f4d7625362aff588f9fc013bbb7323a63571628a2cf2d/ruff-0.6.7-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:c6707a32e03b791f4448dc0dce24b636cbcdee4dd5607adc24e5ee73fd86c00a", size = 9958833 }, 23 | { url = "https://files.pythonhosted.org/packages/27/25/5dd1c32bfc3ad3136c8ebe84312d1bdd2e6c908ac7f60692ec009b7050a8/ruff-0.6.7-py3-none-macosx_11_0_arm64.whl", hash = "sha256:533d66b7774ef224e7cf91506a7dafcc9e8ec7c059263ec46629e54e7b1f90ab", size = 9633369 }, 24 | { url = "https://files.pythonhosted.org/packages/0e/3e/01b25484f3cb08fe6fddedf1f55f3f3c0af861a5b5f5082fbe60ab4b2596/ruff-0.6.7-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:17a86aac6f915932d259f7bec79173e356165518859f94649d8c50b81ff087e9", size = 10637415 }, 25 | { url = "https://files.pythonhosted.org/packages/8a/c9/5bb9b849e4777e0f961de43edf95d2af0ab34999a5feee957be096887876/ruff-0.6.7-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b3f8822defd260ae2460ea3832b24d37d203c3577f48b055590a426a722d50ef", size = 10097389 }, 26 | { url = "https://files.pythonhosted.org/packages/52/cf/e08f1c290c7d848ddfb2ae811f24f445c18e1d3e50e01c38ffa7f5a50494/ruff-0.6.7-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9ba4efe5c6dbbb58be58dd83feedb83b5e95c00091bf09987b4baf510fee5c99", size = 10951440 }, 27 | { url = 
"https://files.pythonhosted.org/packages/a2/2d/ca8aa0da5841913c302d8034c6de0ce56c401c685184d8dd23cfdd0003f9/ruff-0.6.7-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:525201b77f94d2b54868f0cbe5edc018e64c22563da6c5c2e5c107a4e85c1c0d", size = 11708900 }, 28 | { url = "https://files.pythonhosted.org/packages/89/fc/9a83c57baee977c82392e19a328b52cebdaf61601af3d99498e278ef5104/ruff-0.6.7-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8854450839f339e1049fdbe15d875384242b8e85d5c6947bb2faad33c651020b", size = 11258892 }, 29 | { url = "https://files.pythonhosted.org/packages/d3/a3/254cc7afef702c68ae9079290c2a1477ae0e81478589baf745026d8a4eb5/ruff-0.6.7-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2f0b62056246234d59cbf2ea66e84812dc9ec4540518e37553513392c171cb18", size = 12367932 }, 30 | { url = "https://files.pythonhosted.org/packages/9f/55/53f10c1bd8c3b2ae79aed18e62b22c6346f9296aa0ec80489b8442bd06a9/ruff-0.6.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b1462fa56c832dc0cea5b4041cfc9c97813505d11cce74ebc6d1aae068de36b", size = 10838629 }, 31 | { url = "https://files.pythonhosted.org/packages/84/72/fb335c2b25432c63d15383ecbd7bfc1915e68cdf8d086a08042052144255/ruff-0.6.7-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:02b083770e4cdb1495ed313f5694c62808e71764ec6ee5db84eedd82fd32d8f5", size = 10648824 }, 32 | { url = "https://files.pythonhosted.org/packages/92/a8/d57e135a8ad99b6a0c6e2a5c590bcacdd57f44340174f4409c3893368610/ruff-0.6.7-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0c05fd37013de36dfa883a3854fae57b3113aaa8abf5dea79202675991d48624", size = 10174368 }, 33 | { url = "https://files.pythonhosted.org/packages/a7/6f/1a30a6e81dcf2fa9ff3f7011eb87fe76c12a3c6bba74db6a1977d763de1f/ruff-0.6.7-py3-none-musllinux_1_2_i686.whl", hash = "sha256:f49c9caa28d9bbfac4a637ae10327b3db00f47d038f3fbb2195c4d682e925b14", size = 10514383 }, 34 | { url = "https://files.pythonhosted.org/packages/0b/25/df6f2575bc9fe43a6dedfd8dee12896f09a94303e2c828d5f85856bb69a0/ruff-0.6.7-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:a0e1655868164e114ba43a908fd2d64a271a23660195017c17691fb6355d59bb", size = 10902340 }, 35 | { url = "https://files.pythonhosted.org/packages/68/62/f2c1031e2fb7b94f9bf0603744e73db4ef90081b0eb1b9639a6feefd52ea/ruff-0.6.7-py3-none-win32.whl", hash = "sha256:a939ca435b49f6966a7dd64b765c9df16f1faed0ca3b6f16acdf7731969deb35", size = 8448033 }, 36 | { url = "https://files.pythonhosted.org/packages/97/80/193d1604a3f7d75eb1b2a7ce6bf0fdbdbc136889a65caacea6ffb29501b1/ruff-0.6.7-py3-none-win_amd64.whl", hash = "sha256:590445eec5653f36248584579c06252ad2e110a5d1f32db5420de35fb0e1c977", size = 9273543 }, 37 | { url = "https://files.pythonhosted.org/packages/8e/a8/4abb5a9f58f51e4b1ea386be5ab2e547035bc1ee57200d1eca2f8909a33e/ruff-0.6.7-py3-none-win_arm64.whl", hash = "sha256:b28f0d5e2f771c1fe3c7a45d3f53916fc74a480698c4b5731f0bea61e52137c8", size = 8618044 }, 38 | ] 39 | --------------------------------------------------------------------------------