├── .gitignore ├── README.md ├── Semi_structured_and_multi_modal_RAG (1).ipynb └── test.pdf /.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | .venv -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Multimodal Retrieval Augmented Generation with LLMs 2 | 3 | This project harnesses the power of OpenAI's language models (LMs) like GPT-3.5 and GPT-4 for multimodal retrieval and augmented generation. It focuses on analyzing, summarizing, and indexing diverse data types - text, tables, and images. Summaries are stored in a Chroma vectorstore and an InMemoryStore, using OpenAIEmbeddings for sophisticated indexing. The system is designed for advanced information synthesis across formats, priming it for future integrations with multimodal LLMs, including GPT4-V and CLIP, to revolutionize AI-driven content processing and creation. 4 | -------------------------------------------------------------------------------- /Semi_structured_and_multi_modal_RAG (1).ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "140580ef-5db0-43cc-a524-9c39e04d4df0", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "! pip install langchain unstructured[all-docs] pydantic lxml openai chromadb tiktoken" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "74b56bde-1ba0-4525-a11d-cab02c5659e4", 16 | "metadata": {}, 17 | "source": [ 18 | "## Data Loading\n", 19 | "\n", 20 | "### Partition PDF tables, text, and images\n", 21 | "\n", 22 | "* Use [Unstructured](https://unstructured-io.github.io/unstructured/) to partition elements" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "id": "e98bdeb7-eb77-42e6-a3a5-c3f27a1838d5", 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "from typing import Any\n", 33 | "import os\n", 34 | "from unstructured.partition.pdf import partition_pdf\n", 35 | "import pytesseract\n", 36 | "import os\n", 37 | "\n", 38 | "pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'\n", 39 | "\n", 40 | "input_path = os.getcwd()\n", 41 | "output_path = os.path.join(os.getcwd(), \"output\")\n", 42 | "\n", 43 | "# Get elements\n", 44 | "raw_pdf_elements = partition_pdf(\n", 45 | " filename=os.path.join(input_path, \"test.pdf\"),\n", 46 | " extract_images_in_pdf=True,\n", 47 | " infer_table_structure=True,\n", 48 | " chunking_strategy=\"by_title\",\n", 49 | " max_characters=4000,\n", 50 | " new_after_n_chars=3800,\n", 51 | " combine_text_under_n_chars=2000,\n", 52 | " image_output_dir_path=output_path,\n", 53 | ")" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "id": "5f660305-e165-4b6c-ada3-a67a422defb5", 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "import base64\n", 64 | "\n", 65 | "text_elements = []\n", 66 | "table_elements = []\n", 67 | "image_elements = []\n", 68 | "\n", 69 | "# Function to encode images\n", 70 | "def encode_image(image_path):\n", 71 | " with open(image_path, \"rb\") as image_file:\n", 72 | " return base64.b64encode(image_file.read()).decode('utf-8')\n", 73 | "\n", 74 | "for element in raw_pdf_elements:\n", 75 | " if 'CompositeElement' in str(type(element)):\n", 76 | " text_elements.append(element)\n", 77 | " elif 'Table' in str(type(element)):\n", 78 | " table_elements.append(element)\n", 79 | "\n", 80 | "table_elements = [i.text for i in table_elements]\n", 81 | "text_elements = [i.text for i in text_elements]\n", 82 | "\n", 83 | "# Tables\n", 84 | "print(len(table_elements))\n", 85 | "\n", 86 | "# Text\n", 87 | "print(len(text_elements))" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": null, 93 | "metadata": {}, 94 | "outputs": [], 95 | "source": [ 96 | "for image_file in os.listdir(output_path):\n", 97 | " if image_file.endswith(('.png', '.jpg', '.jpeg')):\n", 98 | " image_path = os.path.join(output_path, image_file)\n", 99 | " encoded_image = encode_image(image_path)\n", 100 | " image_elements.append(encoded_image)\n", 101 | "print(len(image_elements))" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "from langchain.chat_models import ChatOpenAI\n", 111 | "from langchain.schema.messages import HumanMessage, AIMessage\n", 112 | "\n", 113 | "chain_gpt_35 = ChatOpenAI(model=\"gpt-3.5-turbo\", max_tokens=1024)\n", 114 | "chain_gpt_4_vision = ChatOpenAI(model=\"gpt-4-vision-preview\", max_tokens=1024)\n", 115 | "\n", 116 | "# Function for text summaries\n", 117 | "def summarize_text(text_element):\n", 118 | " prompt = f\"Summarize the following text:\\n\\n{text_element}\\n\\nSummary:\"\n", 119 | " response = chain_gpt_35.invoke([HumanMessage(content=prompt)])\n", 120 | " return response.content\n", 121 | "\n", 122 | "# Function for table summaries\n", 123 | "def summarize_table(table_element):\n", 124 | " prompt = f\"Summarize the following table:\\n\\n{table_element}\\n\\nSummary:\"\n", 125 | " response = chain_gpt_35.invoke([HumanMessage(content=prompt)])\n", 126 | " return response.content\n", 127 | "\n", 128 | "# Function for image summaries\n", 129 | "def summarize_image(encoded_image):\n", 130 | " prompt = [\n", 131 | " AIMessage(content=\"You are a bot that is good at analyzing images.\"),\n", 132 | " HumanMessage(content=[\n", 133 | " {\"type\": \"text\", \"text\": \"Describe the contents of this image.\"},\n", 134 | " {\n", 135 | " \"type\": \"image_url\",\n", 136 | " \"image_url\": {\n", 137 | " \"url\": f\"data:image/jpeg;base64,{encoded_image}\"\n", 138 | " },\n", 139 | " },\n", 140 | " ])\n", 141 | " ]\n", 142 | " response = chain_gpt_4_vision.invoke(prompt)\n", 143 | " return response.content" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "# Processing table elements with feedback and sleep\n", 153 | "table_summaries = []\n", 154 | "for i, te in enumerate(table_elements[0:2]):\n", 155 | " summary = summarize_table(te)\n", 156 | " table_summaries.append(summary)\n", 157 | " print(f\"{i + 1}th element of tables processed.\")" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": null, 163 | "metadata": {}, 164 | "outputs": [], 165 | "source": [ 166 | "# Processing text elements with feedback and sleep\n", 167 | "text_summaries = []\n", 168 | "for i, te in enumerate(text_elements[0:2]):\n", 169 | " summary = summarize_text(te)\n", 170 | " text_summaries.append(summary)\n", 171 | " print(f\"{i + 1}th element of texts processed.\")" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "# Processing image elements with feedback and sleep\n", 181 | "image_summaries = []\n", 182 | "for i, ie in enumerate(image_elements[0:2]):\n", 183 | " summary = summarize_image(ie)\n", 184 | " image_summaries.append(summary)\n", 185 | " print(f\"{i + 1}th element of images processed.\")" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "id": "0aa7f52f-bf5c-4ba4-af72-b2ccba59a4cf", 191 | "metadata": {}, 192 | "source": [ 193 | "## Multi-vector retriever\n", 194 | "\n", 195 | "Use [multi-vector-retriever](https://python.langchain.com/docs/modules/data_connection/retrievers/multi_vector#summary).\n", 196 | "\n", 197 | "Summaries are used to retrieve raw tables and / or raw chunks of text." 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "id": "67b030d4-2ac5-41b6-9245-fc3ba5771d87", 203 | "metadata": {}, 204 | "source": [ 205 | "### Add to vectorstore\n", 206 | "\n", 207 | "Use [Multi Vector Retriever](https://python.langchain.com/docs/modules/data_connection/retrievers/multi_vector#summary) with summaries." 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "id": "d643cc61-827d-4f3c-8242-7a7c8291ed8a", 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "import uuid\n", 218 | "\n", 219 | "from langchain.embeddings import OpenAIEmbeddings\n", 220 | "from langchain.retrievers.multi_vector import MultiVectorRetriever\n", 221 | "from langchain.schema.document import Document\n", 222 | "from langchain.storage import InMemoryStore\n", 223 | "from langchain.vectorstores import Chroma\n", 224 | "\n", 225 | "# Initialize the vector store and storage layer\n", 226 | "vectorstore = Chroma(collection_name=\"summaries\", embedding_function=OpenAIEmbeddings())\n", 227 | "store = InMemoryStore()\n", 228 | "id_key = \"doc_id\"\n", 229 | "\n", 230 | "# Initialize the retriever\n", 231 | "retriever = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key=id_key)\n", 232 | "\n", 233 | "# Function to add documents to the retriever\n", 234 | "def add_documents_to_retriever(summaries, original_contents):\n", 235 | " doc_ids = [str(uuid.uuid4()) for _ in summaries]\n", 236 | " summary_docs = [\n", 237 | " Document(page_content=s, metadata={id_key: doc_ids[i]})\n", 238 | " for i, s in enumerate(summaries)\n", 239 | " ]\n", 240 | " retriever.vectorstore.add_documents(summary_docs)\n", 241 | " retriever.docstore.mset(list(zip(doc_ids, original_contents)))\n" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "# Add text summaries\n", 251 | "add_documents_to_retriever(text_summaries, text_elements)\n", 252 | "\n", 253 | "# Add table summaries\n", 254 | "add_documents_to_retriever(table_summaries, table_elements)\n", 255 | "\n", 256 | "# Add image summaries\n", 257 | "add_documents_to_retriever(image_summaries, image_summaries) # hopefully real images soon" 258 | ] 259 | }, 260 | { 261 | "cell_type": "markdown", 262 | "id": "4b45fb81-46b1-426e-aa2c-01aed4eac700", 263 | "metadata": {}, 264 | "source": [ 265 | "# Table retrieval\n", 266 | "\n", 267 | "The most complex table in the paper:" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "id": "1bea75fe-85af-4955-a80c-6e0b44a8e215", 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "# We can retrieve this table\n", 278 | "retriever.get_relevant_documents(\n", 279 | " \"What do you see on the images in the database?\"\n", 280 | ")" 281 | ] 282 | }, 283 | { 284 | "cell_type": "markdown", 285 | "id": "6fde6f17-d244-4270-b759-68e1858d399f", 286 | "metadata": {}, 287 | "source": [ 288 | "We can retrieve this image summary:" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "id": "771a47fa-1267-4db8-a6ae-5fde48bbc069", 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "from langchain.schema.runnable import RunnablePassthrough\n", 299 | "from langchain.prompts import ChatPromptTemplate\n", 300 | "from langchain.schema.output_parser import StrOutputParser\n", 301 | "\n", 302 | "template = \"\"\"Answer the question based only on the following context, which can include text, images and tables:\n", 303 | "{context}\n", 304 | "Question: {question}\n", 305 | "\"\"\"\n", 306 | "prompt = ChatPromptTemplate.from_template(template)\n", 307 | "\n", 308 | "model = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo\")\n", 309 | "\n", 310 | "chain = (\n", 311 | " {\"context\": retriever, \"question\": RunnablePassthrough()}\n", 312 | " | prompt\n", 313 | " | model\n", 314 | " | StrOutputParser()\n", 315 | ")" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": null, 321 | "id": "ea8414a8-65ee-4e11-8154-029b454f46af", 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "chain.invoke(\n", 326 | " \"What do you see on the images in the database?\"\n", 327 | ")" 328 | ] 329 | } 330 | ], 331 | "metadata": { 332 | "kernelspec": { 333 | "display_name": "Python 3 (ipykernel)", 334 | "language": "python", 335 | "name": "python3" 336 | }, 337 | "language_info": { 338 | "codemirror_mode": { 339 | "name": "ipython", 340 | "version": 3 341 | }, 342 | "file_extension": ".py", 343 | "mimetype": "text/x-python", 344 | "name": "python", 345 | "nbconvert_exporter": "python", 346 | "pygments_lexer": "ipython3", 347 | "version": "3.11.0" 348 | } 349 | }, 350 | "nbformat": 4, 351 | "nbformat_minor": 5 352 | } 353 | -------------------------------------------------------------------------------- /test.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Coding-Crashkurse/Multimodal-RAG-With-OpenAI/33269c382e716c8c595d4bbd7c20401859166908/test.pdf --------------------------------------------------------------------------------