├── .gitignore
├── README.md
├── Semi_structured_and_multi_modal_RAG (1).ipynb
└── test.pdf


/.gitignore:
--------------------------------------------------------------------------------
1 | .env
2 | .venv


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Multimodal Retrieval Augmented Generation with LLMs
2 | 
3 | This project harnesses the power of OpenAI's language models (LMs) like GPT-3.5 and GPT-4 for multimodal retrieval and augmented generation. It focuses on analyzing, summarizing, and indexing diverse data types - text, tables, and images. Summaries are stored in a Chroma vectorstore and an InMemoryStore, using OpenAIEmbeddings for sophisticated indexing. The system is designed for advanced information synthesis across formats, priming it for future integrations with multimodal LLMs, including GPT4-V and CLIP, to revolutionize AI-driven content processing and creation.
4 | 


--------------------------------------------------------------------------------
/Semi_structured_and_multi_modal_RAG (1).ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "id": "140580ef-5db0-43cc-a524-9c39e04d4df0",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "! pip install langchain unstructured[all-docs] pydantic lxml openai chromadb tiktoken"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "markdown",
 15 |    "id": "74b56bde-1ba0-4525-a11d-cab02c5659e4",
 16 |    "metadata": {},
 17 |    "source": [
 18 |     "## Data Loading\n",
 19 |     "\n",
 20 |     "### Partition PDF tables, text, and images\n",
 21 |     "\n",
 22 |     "* Use [Unstructured](https://unstructured-io.github.io/unstructured/) to partition elements"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "code",
 27 |    "execution_count": null,
 28 |    "id": "e98bdeb7-eb77-42e6-a3a5-c3f27a1838d5",
 29 |    "metadata": {},
 30 |    "outputs": [],
 31 |    "source": [
 32 |     "from typing import Any\n",
 33 |     "import os\n",
 34 |     "from unstructured.partition.pdf import partition_pdf\n",
 35 |     "import pytesseract\n",
 36 |     "import os\n",
 37 |     "\n",
 38 |     "pytesseract.pytesseract.tesseract_cmd = r'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'\n",
 39 |     "\n",
 40 |     "input_path = os.getcwd()\n",
 41 |     "output_path = os.path.join(os.getcwd(), \"output\")\n",
 42 |     "\n",
 43 |     "# Get elements\n",
 44 |     "raw_pdf_elements = partition_pdf(\n",
 45 |     "    filename=os.path.join(input_path, \"test.pdf\"),\n",
 46 |     "    extract_images_in_pdf=True,\n",
 47 |     "    infer_table_structure=True,\n",
 48 |     "    chunking_strategy=\"by_title\",\n",
 49 |     "    max_characters=4000,\n",
 50 |     "    new_after_n_chars=3800,\n",
 51 |     "    combine_text_under_n_chars=2000,\n",
 52 |     "    image_output_dir_path=output_path,\n",
 53 |     ")"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": null,
 59 |    "id": "5f660305-e165-4b6c-ada3-a67a422defb5",
 60 |    "metadata": {},
 61 |    "outputs": [],
 62 |    "source": [
 63 |     "import base64\n",
 64 |     "\n",
 65 |     "text_elements = []\n",
 66 |     "table_elements = []\n",
 67 |     "image_elements = []\n",
 68 |     "\n",
 69 |     "# Function to encode images\n",
 70 |     "def encode_image(image_path):\n",
 71 |     "    with open(image_path, \"rb\") as image_file:\n",
 72 |     "        return base64.b64encode(image_file.read()).decode('utf-8')\n",
 73 |     "\n",
 74 |     "for element in raw_pdf_elements:\n",
 75 |     "    if 'CompositeElement' in str(type(element)):\n",
 76 |     "        text_elements.append(element)\n",
 77 |     "    elif 'Table' in str(type(element)):\n",
 78 |     "        table_elements.append(element)\n",
 79 |     "\n",
 80 |     "table_elements = [i.text for i in table_elements]\n",
 81 |     "text_elements = [i.text for i in text_elements]\n",
 82 |     "\n",
 83 |     "# Tables\n",
 84 |     "print(len(table_elements))\n",
 85 |     "\n",
 86 |     "# Text\n",
 87 |     "print(len(text_elements))"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": null,
 93 |    "metadata": {},
 94 |    "outputs": [],
 95 |    "source": [
 96 |     "for image_file in os.listdir(output_path):\n",
 97 |     "    if image_file.endswith(('.png', '.jpg', '.jpeg')):\n",
 98 |     "        image_path = os.path.join(output_path, image_file)\n",
 99 |     "        encoded_image = encode_image(image_path)\n",
100 |     "        image_elements.append(encoded_image)\n",
101 |     "print(len(image_elements))"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "code",
106 |    "execution_count": null,
107 |    "metadata": {},
108 |    "outputs": [],
109 |    "source": [
110 |     "from langchain.chat_models import ChatOpenAI\n",
111 |     "from langchain.schema.messages import HumanMessage, AIMessage\n",
112 |     "\n",
113 |     "chain_gpt_35 = ChatOpenAI(model=\"gpt-3.5-turbo\", max_tokens=1024)\n",
114 |     "chain_gpt_4_vision = ChatOpenAI(model=\"gpt-4-vision-preview\", max_tokens=1024)\n",
115 |     "\n",
116 |     "# Function for text summaries\n",
117 |     "def summarize_text(text_element):\n",
118 |     "    prompt = f\"Summarize the following text:\\n\\n{text_element}\\n\\nSummary:\"\n",
119 |     "    response = chain_gpt_35.invoke([HumanMessage(content=prompt)])\n",
120 |     "    return response.content\n",
121 |     "\n",
122 |     "# Function for table summaries\n",
123 |     "def summarize_table(table_element):\n",
124 |     "    prompt = f\"Summarize the following table:\\n\\n{table_element}\\n\\nSummary:\"\n",
125 |     "    response = chain_gpt_35.invoke([HumanMessage(content=prompt)])\n",
126 |     "    return response.content\n",
127 |     "\n",
128 |     "# Function for image summaries\n",
129 |     "def summarize_image(encoded_image):\n",
130 |     "    prompt = [\n",
131 |     "        AIMessage(content=\"You are a bot that is good at analyzing images.\"),\n",
132 |     "        HumanMessage(content=[\n",
133 |     "            {\"type\": \"text\", \"text\": \"Describe the contents of this image.\"},\n",
134 |     "            {\n",
135 |     "                \"type\": \"image_url\",\n",
136 |     "                \"image_url\": {\n",
137 |     "                    \"url\": f\"data:image/jpeg;base64,{encoded_image}\"\n",
138 |     "                },\n",
139 |     "            },\n",
140 |     "        ])\n",
141 |     "    ]\n",
142 |     "    response = chain_gpt_4_vision.invoke(prompt)\n",
143 |     "    return response.content"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "code",
148 |    "execution_count": null,
149 |    "metadata": {},
150 |    "outputs": [],
151 |    "source": [
152 |     "# Processing table elements with feedback and sleep\n",
153 |     "table_summaries = []\n",
154 |     "for i, te in enumerate(table_elements[0:2]):\n",
155 |     "    summary = summarize_table(te)\n",
156 |     "    table_summaries.append(summary)\n",
157 |     "    print(f\"{i + 1}th element of tables processed.\")"
158 |    ]
159 |   },
160 |   {
161 |    "cell_type": "code",
162 |    "execution_count": null,
163 |    "metadata": {},
164 |    "outputs": [],
165 |    "source": [
166 |     "# Processing text elements with feedback and sleep\n",
167 |     "text_summaries = []\n",
168 |     "for i, te in enumerate(text_elements[0:2]):\n",
169 |     "    summary = summarize_text(te)\n",
170 |     "    text_summaries.append(summary)\n",
171 |     "    print(f\"{i + 1}th element of texts processed.\")"
172 |    ]
173 |   },
174 |   {
175 |    "cell_type": "code",
176 |    "execution_count": null,
177 |    "metadata": {},
178 |    "outputs": [],
179 |    "source": [
180 |     "# Processing image elements with feedback and sleep\n",
181 |     "image_summaries = []\n",
182 |     "for i, ie in enumerate(image_elements[0:2]):\n",
183 |     "    summary = summarize_image(ie)\n",
184 |     "    image_summaries.append(summary)\n",
185 |     "    print(f\"{i + 1}th element of images processed.\")"
186 |    ]
187 |   },
188 |   {
189 |    "cell_type": "markdown",
190 |    "id": "0aa7f52f-bf5c-4ba4-af72-b2ccba59a4cf",
191 |    "metadata": {},
192 |    "source": [
193 |     "## Multi-vector retriever\n",
194 |     "\n",
195 |     "Use [multi-vector-retriever](https://python.langchain.com/docs/modules/data_connection/retrievers/multi_vector#summary).\n",
196 |     "\n",
197 |     "Summaries are used to retrieve raw tables and / or raw chunks of text."
198 |    ]
199 |   },
200 |   {
201 |    "cell_type": "markdown",
202 |    "id": "67b030d4-2ac5-41b6-9245-fc3ba5771d87",
203 |    "metadata": {},
204 |    "source": [
205 |     "### Add to vectorstore\n",
206 |     "\n",
207 |     "Use [Multi Vector Retriever](https://python.langchain.com/docs/modules/data_connection/retrievers/multi_vector#summary) with summaries."
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "code",
212 |    "execution_count": null,
213 |    "id": "d643cc61-827d-4f3c-8242-7a7c8291ed8a",
214 |    "metadata": {},
215 |    "outputs": [],
216 |    "source": [
217 |     "import uuid\n",
218 |     "\n",
219 |     "from langchain.embeddings import OpenAIEmbeddings\n",
220 |     "from langchain.retrievers.multi_vector import MultiVectorRetriever\n",
221 |     "from langchain.schema.document import Document\n",
222 |     "from langchain.storage import InMemoryStore\n",
223 |     "from langchain.vectorstores import Chroma\n",
224 |     "\n",
225 |     "# Initialize the vector store and storage layer\n",
226 |     "vectorstore = Chroma(collection_name=\"summaries\", embedding_function=OpenAIEmbeddings())\n",
227 |     "store = InMemoryStore()\n",
228 |     "id_key = \"doc_id\"\n",
229 |     "\n",
230 |     "# Initialize the retriever\n",
231 |     "retriever = MultiVectorRetriever(vectorstore=vectorstore, docstore=store, id_key=id_key)\n",
232 |     "\n",
233 |     "# Function to add documents to the retriever\n",
234 |     "def add_documents_to_retriever(summaries, original_contents):\n",
235 |     "    doc_ids = [str(uuid.uuid4()) for _ in summaries]\n",
236 |     "    summary_docs = [\n",
237 |     "        Document(page_content=s, metadata={id_key: doc_ids[i]})\n",
238 |     "        for i, s in enumerate(summaries)\n",
239 |     "    ]\n",
240 |     "    retriever.vectorstore.add_documents(summary_docs)\n",
241 |     "    retriever.docstore.mset(list(zip(doc_ids, original_contents)))\n"
242 |    ]
243 |   },
244 |   {
245 |    "cell_type": "code",
246 |    "execution_count": null,
247 |    "metadata": {},
248 |    "outputs": [],
249 |    "source": [
250 |     "# Add text summaries\n",
251 |     "add_documents_to_retriever(text_summaries, text_elements)\n",
252 |     "\n",
253 |     "# Add table summaries\n",
254 |     "add_documents_to_retriever(table_summaries, table_elements)\n",
255 |     "\n",
256 |     "# Add image summaries\n",
257 |     "add_documents_to_retriever(image_summaries, image_summaries) # hopefully real images soon"
258 |    ]
259 |   },
260 |   {
261 |    "cell_type": "markdown",
262 |    "id": "4b45fb81-46b1-426e-aa2c-01aed4eac700",
263 |    "metadata": {},
264 |    "source": [
265 |     "# Table retrieval\n",
266 |     "\n",
267 |     "The most complex table in the paper:"
268 |    ]
269 |   },
270 |   {
271 |    "cell_type": "code",
272 |    "execution_count": null,
273 |    "id": "1bea75fe-85af-4955-a80c-6e0b44a8e215",
274 |    "metadata": {},
275 |    "outputs": [],
276 |    "source": [
277 |     "# We can retrieve this table\n",
278 |     "retriever.get_relevant_documents(\n",
279 |     "    \"What do you see on the images in the database?\"\n",
280 |     ")"
281 |    ]
282 |   },
283 |   {
284 |    "cell_type": "markdown",
285 |    "id": "6fde6f17-d244-4270-b759-68e1858d399f",
286 |    "metadata": {},
287 |    "source": [
288 |     "We can retrieve this image summary:"
289 |    ]
290 |   },
291 |   {
292 |    "cell_type": "code",
293 |    "execution_count": null,
294 |    "id": "771a47fa-1267-4db8-a6ae-5fde48bbc069",
295 |    "metadata": {},
296 |    "outputs": [],
297 |    "source": [
298 |     "from langchain.schema.runnable import RunnablePassthrough\n",
299 |     "from langchain.prompts import ChatPromptTemplate\n",
300 |     "from langchain.schema.output_parser import StrOutputParser\n",
301 |     "\n",
302 |     "template = \"\"\"Answer the question based only on the following context, which can include text, images and tables:\n",
303 |     "{context}\n",
304 |     "Question: {question}\n",
305 |     "\"\"\"\n",
306 |     "prompt = ChatPromptTemplate.from_template(template)\n",
307 |     "\n",
308 |     "model = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo\")\n",
309 |     "\n",
310 |     "chain = (\n",
311 |     "    {\"context\": retriever, \"question\": RunnablePassthrough()}\n",
312 |     "    | prompt\n",
313 |     "    | model\n",
314 |     "    | StrOutputParser()\n",
315 |     ")"
316 |    ]
317 |   },
318 |   {
319 |    "cell_type": "code",
320 |    "execution_count": null,
321 |    "id": "ea8414a8-65ee-4e11-8154-029b454f46af",
322 |    "metadata": {},
323 |    "outputs": [],
324 |    "source": [
325 |     "chain.invoke(\n",
326 |     "     \"What do you see on the images in the database?\"\n",
327 |     ")"
328 |    ]
329 |   }
330 |  ],
331 |  "metadata": {
332 |   "kernelspec": {
333 |    "display_name": "Python 3 (ipykernel)",
334 |    "language": "python",
335 |    "name": "python3"
336 |   },
337 |   "language_info": {
338 |    "codemirror_mode": {
339 |     "name": "ipython",
340 |     "version": 3
341 |    },
342 |    "file_extension": ".py",
343 |    "mimetype": "text/x-python",
344 |    "name": "python",
345 |    "nbconvert_exporter": "python",
346 |    "pygments_lexer": "ipython3",
347 |    "version": "3.11.0"
348 |   }
349 |  },
350 |  "nbformat": 4,
351 |  "nbformat_minor": 5
352 | }
353 | 


--------------------------------------------------------------------------------
/test.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Coding-Crashkurse/Multimodal-RAG-With-OpenAI/33269c382e716c8c595d4bbd7c20401859166908/test.pdf


--------------------------------------------------------------------------------