├── Capabilities of Langchain.ipynb ├── README.md ├── agent_app.py ├── building_chatbot.ipynb ├── chatbot_demo.py ├── chatbot_with_memory.ipynb ├── chatpdf.ipynb └── chatpdf.py /Capabilities of Langchain.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "cb65178f", 6 | "metadata": {}, 7 | "source": [ 8 | "## Capabilities of Langchain\n", 9 | "\n", 10 | "Langchain is an AI framework that enables building applications integrated with language models
\n", 11 | "In this notebook, we shall learn about some of the core functionalities of LangChain and how you can use them
" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "id": "e83d8d28", 17 | "metadata": {}, 18 | "source": [ 19 | " Capabilities \n", 20 | "\n", 21 | "1. Modelling interface: enables easy & streamlined access to language models from OpenAI, HuggingFace, etc\n", 22 | "\n", 23 | "2. Prompts: prompt management, prompt optimization & serialization. Easily create prompt templates\n", 24 | "\n", 25 | "3. Chains: Allows combining multiple tasks with or without language models to executed together\n", 26 | "\n", 27 | "4. Memory:\n", 28 | "\n", 29 | "5. Indexes: Combine model with your own custom data. It provides data loaders and vector stores\n", 30 | "\n", 31 | "6. Agent & Tools: " 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 10, 37 | "id": "4240a8df", 38 | "metadata": { 39 | "collapsed": true 40 | }, 41 | "outputs": [ 42 | { 43 | "name": "stdout", 44 | "output_type": "stream", 45 | "text": [ 46 | "Collecting openai\n", 47 | " Downloading openai-0.27.8-py3-none-any.whl (73 kB)\n", 48 | " ---------------------------------------- 0.0/73.6 kB ? 
eta -:--:--\n", 49 | " ---------------------------------------- 73.6/73.6 kB 4.0 MB/s eta 0:00:00\n", 50 | "Requirement already satisfied: tqdm in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from openai) (4.65.0)\n", 51 | "Requirement already satisfied: aiohttp in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from openai) (3.8.3)\n", 52 | "Requirement already satisfied: requests>=2.20 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from openai) (2.29.0)\n", 53 | "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from requests>=2.20->openai) (3.4)\n", 54 | "Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from requests>=2.20->openai) (2.0.4)\n", 55 | "Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from requests>=2.20->openai) (1.26.16)\n", 56 | "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from requests>=2.20->openai) (2023.5.7)\n", 57 | "Requirement already satisfied: attrs>=17.3.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from aiohttp->openai) (22.1.0)\n", 58 | "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from aiohttp->openai) (4.0.2)\n", 59 | "Requirement already satisfied: frozenlist>=1.1.1 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from aiohttp->openai) (1.3.3)\n", 60 | "Requirement already satisfied: aiosignal>=1.1.2 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from aiohttp->openai) (1.2.0)\n", 61 | "Requirement already satisfied: multidict<7.0,>=4.5 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from aiohttp->openai) (6.0.2)\n", 62 | "Requirement already satisfied: 
yarl<2.0,>=1.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from aiohttp->openai) (1.8.1)\n", 63 | "Requirement already satisfied: colorama in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from tqdm->openai) (0.4.6)\n", 64 | "Installing collected packages: openai\n", 65 | "Successfully installed openai-0.27.8\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "!pip install openai" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 11, 76 | "id": "9c8cf37b", 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "import langchain\n", 81 | "import os\n", 82 | "#import openai" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "id": "2edbbde0", 88 | "metadata": {}, 89 | "source": [ 90 | "#### API KEY" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 21, 96 | "id": "02b03d6c", 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "name": "stdout", 101 | "output_type": "stream", 102 | "text": [ 103 | "········\n" 104 | ] 105 | } 106 | ], 107 | "source": [ 108 | "from getpass import getpass\n", 109 | "HUGGINGFACEHUB_API_TOKEN = getpass()\n", 110 | "#os.environ['OPENAI_API_KEY'] = '...'\n", 111 | "os.environ['HUGGINGFACEHUB_API_TOKEN'] = HUGGINGFACEHUB_API_TOKEN" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "id": "9a843191", 117 | "metadata": {}, 118 | "source": [ 119 | "### 1. 
Modelling interface" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 17, 125 | "id": "8e3f516c", 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "#openai.api_key = os.environ['OPENAI_API_KEY']\n" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 27, 135 | "id": "54ac67f9", 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "name": "stdout", 140 | "output_type": "stream", 141 | "text": [ 142 | "\n", 143 | "import torch.nn.functional as F\n", 144 | "from\n" 145 | ] 146 | } 147 | ], 148 | "source": [ 149 | "#from langchain.llms import OpenAI\n", 150 | "from langchain import HuggingFaceHub\n", 151 | "#llm = OpenAI(model_name=\"text-davinci-003\", )\n", 152 | "\n", 153 | "llm = HuggingFaceHub(repo_id='stabilityai/stablecode-completion-alpha-3b-4k')\n", 154 | "prompt = \"import torch\\nimport torch.nn as nn\"\n", 155 | "completion = llm(prompt)\n", 156 | "\n", 157 | "print(completion)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "id": "ff1ff30e", 163 | "metadata": {}, 164 | "source": [ 165 | "You could use multiple LLM interfaces such as OpenAI, HuggingFaceHub etc" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "id": "075d2975", 171 | "metadata": {}, 172 | "source": [ 173 | "### 2. 
Prompts" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 30, 179 | "id": "680f35a9", 180 | "metadata": {}, 181 | "outputs": [ 182 | { 183 | "data": { 184 | "text/plain": [ 185 | "'Name a market leader that makes computers'" 186 | ] 187 | }, 188 | "execution_count": 30, 189 | "metadata": {}, 190 | "output_type": "execute_result" 191 | } 192 | ], 193 | "source": [ 194 | "from langchain import PromptTemplate\n", 195 | "\n", 196 | "template = \"Name a market leader that makes {product}\"\n", 197 | "\n", 198 | "prompt = PromptTemplate(\n", 199 | " input_variables=['product'],\n", 200 | " template = template\n", 201 | ")\n", 202 | "\n", 203 | "prompt.format(product=\"computers\")" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 39, 209 | "id": "d7284223", 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "data": { 214 | "text/plain": [ 215 | "'Write an email to your HR stating about your reason of absence from office for 5 days.\\nThe email should not exceed 50 words and must contain a subject'" 216 | ] 217 | }, 218 | "execution_count": 39, 219 | "metadata": {}, 220 | "output_type": "execute_result" 221 | } 222 | ], 223 | "source": [ 224 | "email_template = \"\"\"Write an email to your {receiver_name} stating about your reason of absence from office for {x} days.\n", 225 | "The email should not exceed 50 words and must contain a subject\"\"\"\n", 226 | "\n", 227 | "prompt2 = PromptTemplate(\n", 228 | " input_variables=['receiver_name','x'],\n", 229 | " template=email_template\n", 230 | ")\n", 231 | "\n", 232 | "prompt2.format(receiver_name=\"HR\", x=5)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "id": "b1ac9b8b", 238 | "metadata": {}, 239 | "source": [ 240 | "A good prompt is what makes your language models really useful. 
With the prompt templates you can organize and create standard prompt templates for generating information repeatedly" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "id": "2d84526a", 246 | "metadata": {}, 247 | "source": [ 248 | "### 3. Chains" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 42, 254 | "id": "7fc349c3", 255 | "metadata": {}, 256 | "outputs": [ 257 | { 258 | "data": { 259 | "text/plain": [ 260 | "' line written from the point of view of the employee concerned.\\nEmail the HR about the reason'" 261 | ] 262 | }, 263 | "execution_count": 42, 264 | "metadata": {}, 265 | "output_type": "execute_result" 266 | } 267 | ], 268 | "source": [ 269 | "from langchain import LLMChain\n", 270 | "\n", 271 | "llm2 = HuggingFaceHub(repo_id='gpt2-large')\n", 272 | "chain = LLMChain(llm=llm2,\n", 273 | " prompt=prompt2)\n", 274 | "\n", 275 | "chain.run(receiver_name=\"HR\",x=3)" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "id": "7349fe79", 281 | "metadata": {}, 282 | "source": [ 283 | "### 4. Memory" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 47, 289 | "id": "4d4e74ff", 290 | "metadata": {}, 291 | "outputs": [ 292 | { 293 | "name": "stdout", 294 | "output_type": "stream", 295 | "text": [ 296 | "\n", 297 | "\n", 298 | "\u001b[1m> Entering new ConversationChain chain...\u001b[0m\n", 299 | "Prompt after formatting:\n", 300 | "\u001b[32;1m\u001b[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. 
If the AI does not know the answer to a question, it truthfully says it does not know.\n", 301 | "\n", 302 | "Current conversation:\n", 303 | "\n", 304 | "Human: Do you know a good platform to learn data science?\n", 305 | "AI:\u001b[0m\n", 306 | "\n", 307 | "\u001b[1m> Finished chain.\u001b[0m\n", 308 | "\n", 309 | "\n", 310 | "\u001b[1m> Entering new ConversationChain chain...\u001b[0m\n", 311 | "Prompt after formatting:\n", 312 | "\u001b[32;1m\u001b[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.\n", 313 | "\n", 314 | "Current conversation:\n", 315 | "Human: Do you know a good platform to learn data science?\n", 316 | "AI: What\n", 317 | "Human: The learners have been very supportive. Let's thank them\n", 318 | "AI:\u001b[0m\n", 319 | "\n", 320 | "\u001b[1m> Finished chain.\u001b[0m\n" 321 | ] 322 | }, 323 | { 324 | "data": { 325 | "text/plain": [ 326 | "' Thank'" 327 | ] 328 | }, 329 | "execution_count": 47, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "from langchain import ConversationChain\n", 336 | "\n", 337 | "conversation = ConversationChain(llm=llm2, verbose=True)\n", 338 | "conversation.predict(input=\"Do you know a good platform to learn data science?\")\n", 339 | "#conversation.predict(input=\"Sourav is regularly building content for Datahat\")\n", 340 | "conversation.predict(input=\"The learners have been very supportive. Let's thank them\")" 341 | ] 342 | }, 343 | { 344 | "cell_type": "markdown", 345 | "id": "ac9958e1", 346 | "metadata": {}, 347 | "source": [ 348 | "### 5. Indexing" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "id": "624e51f0", 354 | "metadata": {}, 355 | "source": [ 356 | "Lack of contextual information such as access to particular documents or emails is one drawback of LLMs.
\n", 357 | "Giving LLMs access to particular external data will help build personalized context
\n", 358 | "
\n", 359 | "Langchain provides vector stores to store information as documents and index them" 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "id": "a1653055", 365 | "metadata": {}, 366 | "source": [ 367 | "### 6. Agents & Tools" 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "id": "2c15c922", 373 | "metadata": {}, 374 | "source": [ 375 | "Language models are trained on historical information, meaning they have no awareness of the present happenings
\n", 376 | "Imagine integrating your language model with a search engine to cater to information in real time and generate responses" 377 | ] 378 | }, 379 | { 380 | "cell_type": "code", 381 | "execution_count": null, 382 | "id": "34c2e4fa", 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [] 386 | } 387 | ], 388 | "metadata": { 389 | "kernelspec": { 390 | "display_name": "Python 3 (ipykernel)", 391 | "language": "python", 392 | "name": "python3" 393 | }, 394 | "language_info": { 395 | "codemirror_mode": { 396 | "name": "ipython", 397 | "version": 3 398 | }, 399 | "file_extension": ".py", 400 | "mimetype": "text/x-python", 401 | "name": "python", 402 | "nbconvert_exporter": "python", 403 | "pygments_lexer": "ipython3", 404 | "version": "3.10.11" 405 | } 406 | }, 407 | "nbformat": 4, 408 | "nbformat_minor": 5 409 | } 410 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | * The repository shall be used for reference to the video tutorials made available on 2 | https://www.youtube.com/channel/UCcryXlmaGScYBCNhZe4SVIw [youtube/datahat642] 3 | 4 | * You are free to use these notebooks and scripts for learning purposes 5 | 6 | * Feel free to share your feedback or add improvements wherever required. 
-------------------------------------------------------------------------------- /agent_app.py: -------------------------------------------------------------------------------- 1 | import os 2 | from langchain.llms import OpenAI 3 | #from langchain import HuggingFaceHub 4 | from langchain.prompts import PromptTemplate 5 | from langchain.chains import LLMChain, SimpleSequentialChain, SequentialChain 6 | import streamlit as st 7 | 8 | st.title("Youtube Title Generator App!!") 9 | prompt = st.text_input("Write your keyword here") 10 | 11 | #print(prompt) 12 | open_ai_key = input("Enter your api key:") 13 | llm = OpenAI(temperature=0.9, openai_api_key=open_ai_key) 14 | 15 | title_template = PromptTemplate( 16 | input_variables = ["keyword"], 17 | template = "write a youtube video title about {keyword}") 18 | 19 | 20 | desc_template = PromptTemplate( 21 | input_variables = ["title"], 22 | template = "based on the {title} of the youtube video, generate a youtube description in atmost 150 words" 23 | ) 24 | 25 | title_chain = LLMChain(llm=llm, prompt=title_template, output_key="title") 26 | desc_chain = LLMChain(llm=llm, prompt=desc_template, output_key="description") 27 | #sequential_chain = SimpleSequentialChain(chains=[title_chain, desc_chain], verbose=True) 28 | 29 | sequential_chain = SequentialChain(chains=[title_chain, desc_chain], input_variables=["keyword"], 30 | output_variables=["title","description"], verbose=True) 31 | 32 | if prompt: 33 | response = sequential_chain({"keyword":prompt}) 34 | st.write(response["title"]) 35 | st.write(response["description"]) 36 | -------------------------------------------------------------------------------- /building_chatbot.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "77f6ff43-e425-424c-9dae-f42a1ec7092f", 6 | "metadata": { 7 | "tags": [] 8 | }, 9 | "source": [ 10 | "## Building Conversational Bot using 
Langchain\n", 11 | "\n", 12 | "* In this notebook we shall utilize the capabilities of langchain (PromptTemplate, LLMChain, LLM interface
\n", 13 | "
\n", 14 | "We shall build a conversational bot and create an interface like ChatGPT " 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "id": "101fbee4-4adc-4248-94d1-44e328e99756", 20 | "metadata": {}, 21 | "source": [ 22 | "### Installations" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "id": "b7e1582b-ca9f-4480-a89e-76c84bf9bc50", 29 | "metadata": { 30 | "tags": [] 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "#!pip install huggingface_hub\n", 35 | "#!pip install transformers\n", 36 | "#!pip install langchain\n", 37 | "#!pip install chainlit" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "id": "dd6aa3f7-adad-4997-9d86-17dd9fdfc212", 43 | "metadata": {}, 44 | "source": [ 45 | "I have already installed these libraries in my environment" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 52, 51 | "id": "d0bb339a-be73-4a76-9412-75ef63177c26", 52 | "metadata": { 53 | "tags": [] 54 | }, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | "2023-08-19 11:30:56 - Your app is available at http://localhost:8000\n" 61 | ] 62 | }, 63 | { 64 | "name": "stderr", 65 | "output_type": "stream", 66 | "text": [ 67 | "ERROR: [Errno 10048] error while attempting to bind on address ('0.0.0.0', 8000): only one usage of each socket address (protocol/network address/port) is normally permitted\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "!chainlit hello" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "id": "6329b396-47bc-47ce-ad86-8906cc1e2d06", 78 | "metadata": {}, 79 | "source": [ 80 | "### Importing libraries and access token" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 1, 86 | "id": "e4a969e5-0054-4545-b26b-812923ec2f74", 87 | "metadata": { 88 | "tags": [] 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "import os\n", 93 | "import chainlit as cl\n", 94 | "from langchain import HuggingFaceHub, PromptTemplate, LLMChain" 
95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 2, 100 | "id": "ef7d2430-1cea-45f3-92a9-2fe1f01cde4d", 101 | "metadata": { 102 | "tags": [] 103 | }, 104 | "outputs": [ 105 | { 106 | "name": "stdin", 107 | "output_type": "stream", 108 | "text": [ 109 | " ········\n" 110 | ] 111 | } 112 | ], 113 | "source": [ 114 | "from getpass import getpass\n", 115 | "HUGGINGFACEHUB_API_TOKEN = getpass()\n", 116 | "os.environ['HUGGINGFACEHUB_API_TOKEN'] = HUGGINGFACEHUB_API_TOKEN" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "id": "e572beee-4dcb-4093-9fd3-78ff9c075fd7", 122 | "metadata": {}, 123 | "source": [ 124 | "* The PromptTemplate is one of the elements of LangChain, necessary for building applications based on the Large Language Model. It defines how the model should interpret the user’s questions and in what context it should answer them" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "id": "3b6ef582-ecd6-47b1-87e6-1fefa1b24085", 130 | "metadata": {}, 131 | "source": [ 132 | "### Setting the conversational model" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 3, 138 | "id": "b5aefe8e-c88f-491f-8894-741b0fc74600", 139 | "metadata": { 140 | "tags": [] 141 | }, 142 | "outputs": [ 143 | { 144 | "name": "stderr", 145 | "output_type": "stream", 146 | "text": [ 147 | "C:\\Users\\Sourav\\anaconda3\\envs\\genAI\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 148 | " from .autonotebook import tqdm as notebook_tqdm\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "#model_id = \"microsoft/DialoGPT-medium\" : conversational models are not currently supported by Langchain\n", 154 | "#model_id = \"mosaicml/mpt-7b-instruct\"\n", 155 | "#model_id = \"tiiuae/falcon-7b\"\n", 156 | "model_id = \"gpt2-medium\" #355M parameters\n", 157 | "conv_model = HuggingFaceHub(huggingfacehub_api_token=os.environ['HUGGINGFACEHUB_API_TOKEN'],\n", 158 | " repo_id=model_id,\n", 159 | " model_kwargs={\"temperature\":0.8, \"max_new_tokens\":200}) #0 to 1" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 53, 165 | "id": "58f4407e-dfe4-4b7e-9b9d-11ea047e3968", 166 | "metadata": { 167 | "tags": [] 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "template = \"\"\"You are a helpful AI assistant that makes stories by completing the query provided by the user \n", 172 | "\n", 173 | "{query}\n", 174 | "\"\"\"\n", 175 | "\n", 176 | "prompt = PromptTemplate(template=template, input_variables=['query'])" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 6, 182 | "id": "7fa89aee-7e5f-4af3-9164-87654529f9ce", 183 | "metadata": { 184 | "tags": [] 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "conv_chain = LLMChain(llm=conv_model,\n", 189 | " prompt=prompt,\n", 190 | " verbose=True)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 7, 196 | "id": "782ecce6-dc8c-4859-a0b7-bf0bb05b9b8c", 197 | "metadata": { 198 | "tags": [] 199 | }, 200 | "outputs": [ 201 | { 202 | "name": "stdout", 203 | "output_type": "stream", 204 | "text": [ 205 | "\n", 206 | "\n", 207 | "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n", 208 | "Prompt after formatting:\n", 209 | "\u001b[32;1m\u001b[1;3mYou are a helpful AI assistant that makes stories by completing the query provided by the user \n", 210 | "\n", 211 | 
"Once upon a time in 1947\n", 212 | "\u001b[0m\n", 213 | "\n", 214 | "\u001b[1m> Finished chain.\u001b[0m\n", 215 | "\n", 216 | "The American public discovered that they had been lied to by a television network. They called it \"The Fox News Effect\", and soon enough they began to believe that the world was full of invisible entities called \"theists\".\n", 217 | "\n", 218 | "Somehow these \"theists\" managed to capture the hearts of the populace, and began to become a kind of god, a sort of benevolent leader that would guide people through their daily lives, and answer questions about the world around them.\n", 219 | "\n", 220 | "They did this by getting you to fill out an application, and they would ask you questions about the world around you, and then deliver a message that would tell your tale as to why you should care what the world thinks about you.\n", 221 | "\n", 222 | "Since then, we humans have been running around the world, making headlines, being interviewed, and telling stories about what we think about the world. We even invented fake religion, because of course we do, and to tell the truth we have to think we know what people\n" 223 | ] 224 | } 225 | ], 226 | "source": [ 227 | "print(conv_chain.run(\"Once upon a time in 1947\"))" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "id": "b534d3cc-3350-4976-8dd0-885cdc195e8c", 233 | "metadata": {}, 234 | "source": [ 235 | "### Creating chatbot interface with Chainlit" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "id": "f61877b3-e524-4c06-9e4f-2917fe1e1020", 241 | "metadata": {}, 242 | "source": [ 243 | "Chainlit is a python package to create UI for chat interface applications
\n", 244 | "We need to use the decorator from Chainlit for LangChain" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "id": "b0997546-c92e-44a1-b637-1ecd006f98d5", 250 | "metadata": {}, 251 | "source": [ 252 | "https://docs.chainlit.io/overview" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "id": "a5ddee6f-cece-4657-91fb-bc9c369ebf3b", 259 | "metadata": { 260 | "jupyter": { 261 | "source_hidden": true 262 | }, 263 | "tags": [] 264 | }, 265 | "outputs": [], 266 | "source": [ 267 | "@cl.langchain_factory(use_async=False)\n", 268 | "\n", 269 | "def factory():\n", 270 | " prompt = PromptTemplate(template=template, input_variables=['question'])\n", 271 | " conv_chain = LLMChain(llm=conv_model,\n", 272 | " prompt=prompt,\n", 273 | " verbose=True)\n", 274 | " \n", 275 | " return conv_chain" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "id": "10047624-cb09-4104-afcf-092b5869e631", 281 | "metadata": {}, 282 | "source": [ 283 | "### Using Conversational memory" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "id": "c2d4a7ff-9edf-4ba2-a756-4bfe72d5a1ab", 289 | "metadata": {}, 290 | "source": [ 291 | "Conversational memory is how a chatbot can respond to multiple queries in a chat-like manner
\n", 292 | "It enables a coherent conversation, and without it, every query would be treated as an entirely independent input without considering past interactions" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 24, 298 | "id": "5994a34e-9b4d-4081-8099-87e7c8c73289", 299 | "metadata": { 300 | "tags": [] 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "from langchain.memory import ConversationBufferMemory\n", 305 | "from langchain.chains import ConversationChain" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 8, 311 | "id": "0c5d26ff-98fc-4172-8669-de18f8733725", 312 | "metadata": { 313 | "collapsed": true, 314 | "jupyter": { 315 | "outputs_hidden": true 316 | }, 317 | "tags": [] 318 | }, 319 | "outputs": [ 320 | { 321 | "name": "stdout", 322 | "output_type": "stream", 323 | "text": [ 324 | "\n", 325 | "\n", 326 | "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n", 327 | "Prompt after formatting:\n", 328 | "\u001b[32;1m\u001b[1;3mYou are a helpful AI assistant that makes stories by completing the query provided by the user \n", 329 | "\n", 330 | "When I was a child\n", 331 | "\u001b[0m\n", 332 | "\n", 333 | "\u001b[1m> Finished chain.\u001b[0m\n", 334 | "AI: \n", 335 | "I remember being an assistant of the author of the book \"One Week with the Big Daddy\",\n", 336 | "\n", 337 | "For example, I would bring out my phone, record my chat, and then send the result to him via email. Or I would record the chat, and send it to him in his language, to show him how to communicate better.\n", 338 | "\n", 339 | "Once I became quite good with the language, I could play my cell phone with the same accuracy as he did.\n", 340 | "\n", 341 | "I remember at the age of 11. I was using the same voice to help my friends and family. 
This was the first time I had ever felt that I had grown as a human being.\n", 342 | "\n", 343 | "My father used to say that I am the reason that he moved to Spain in the first place.\n", 344 | "\n", 345 | "At the age of 17. I found out that a certain company was looking for a writer to start with. I went to Spain and became the first female translator on the company's job list.\n", 346 | "\n" 347 | ] 348 | }, 349 | { 350 | "name": "stdin", 351 | "output_type": "stream", 352 | "text": [ 353 | "Enter a message or bye to exit!! When I was 17 \n" 354 | ] 355 | }, 356 | { 357 | "name": "stdout", 358 | "output_type": "stream", 359 | "text": [ 360 | "\n", 361 | "\n", 362 | "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n", 363 | "Prompt after formatting:\n", 364 | "\u001b[32;1m\u001b[1;3mYou are a helpful AI assistant that makes stories by completing the query provided by the user \n", 365 | "\n", 366 | "When I was 17 \n", 367 | "\u001b[0m\n", 368 | "\n", 369 | "\u001b[1m> Finished chain.\u001b[0m\n", 370 | "AI: \n", 371 | "My first job was as an AI assistant. I was at a company that had an interesting research project. Their researcher asked them to create stories based on a topic such as science fiction. The story was about a young boy who became an AI agent. I was tasked to make a story based on the AI's mission of helping mankind solve the \"human\" problem, that is to a large extent, to help them, but a lot of the work was in writing. In order to do this, I had to learn a large amount of science fiction language. I was then given a research paper and was directed to study it. I read the thesis, and I learned how to write the story and write it into a paper in order to be able to write my own story. Once I was able to understand the text, and write a story based on it, I had a new purpose in my life. I had to start writing my own story. The process was very painful. 
I did\n" 372 | ] 373 | }, 374 | { 375 | "name": "stdin", 376 | "output_type": "stream", 377 | "text": [ 378 | "Enter a message or bye to exit!! bye\n" 379 | ] 380 | } 381 | ], 382 | "source": [ 383 | "user_message = \"When I was a child\"\n", 384 | "while user_message!='bye':\n", 385 | " memory.chat_memory.add_user_message(user_message)\n", 386 | " resp = conv_chain.run(user_message)\n", 387 | " print(\"AI: \",resp)\n", 388 | " memory.chat_memory.add_ai_message(resp)\n", 389 | " \n", 390 | " user_message = input(\"Enter a message or bye to exit!!\")" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 44, 396 | "id": "38400381-7d11-4bb0-91b6-cd80fea7df15", 397 | "metadata": { 398 | "tags": [] 399 | }, 400 | "outputs": [], 401 | "source": [ 402 | "template_with_memory = \"\"\"You are a helpful chatbot. You answer questions \n", 403 | "after some thought and only provides relevant answer\n", 404 | "\n", 405 | "Previous conversation:\n", 406 | "{chat_history}\n", 407 | "\n", 408 | "New human question: {question}\n", 409 | "\n", 410 | "Response:\n", 411 | "\"\"\"\n", 412 | "\n", 413 | "prompt2 = PromptTemplate(template=template_with_memory, input_variables=['chat_history','question'])" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": 49, 419 | "id": "4898316e-4beb-430f-8b3e-94022c22ea2b", 420 | "metadata": { 421 | "tags": [] 422 | }, 423 | "outputs": [], 424 | "source": [ 425 | "memory = ConversationBufferMemory(memory_key=\"history\")\n", 426 | "\n", 427 | "model_id = \"gpt2-xl\"\n", 428 | "conv_model = HuggingFaceHub(huggingfacehub_api_token=os.environ['HUGGINGFACEHUB_API_TOKEN'],\n", 429 | " repo_id=model_id,\n", 430 | " model_kwargs={\"temperature\":0.8, \"max_length\":128})\n", 431 | "\n", 432 | "conv_chain_with_memory = ConversationChain(llm=conv_model,\n", 433 | " memory=memory,\n", 434 | " verbose=True)" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": 50, 440 | "id": 
"903cc70b-4c96-4aaf-ae92-e1807faac6f8", 441 | "metadata": { 442 | "tags": [] 443 | }, 444 | "outputs": [ 445 | { 446 | "name": "stdout", 447 | "output_type": "stream", 448 | "text": [ 449 | "\n", 450 | "\n", 451 | "\u001b[1m> Entering new ConversationChain chain...\u001b[0m\n", 452 | "Prompt after formatting:\n", 453 | "\u001b[32;1m\u001b[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.\n", 454 | "\n", 455 | "Current conversation:\n", 456 | "\n", 457 | "Human: Hi there!\n", 458 | "AI:\u001b[0m\n", 459 | "\n", 460 | "\u001b[1m> Finished chain.\u001b[0m\n", 461 | " I am an AI, and I would like to learn about your world. You are the first human I have ever spoken to. I am interested in what you do. You are a businessman?\n", 462 | "\n", 463 | "Human: My business is to talk to people.\n", 464 | "\n", 465 | "AI: Do you have a car?\n", 466 | "\n", 467 | "Human\n" 468 | ] 469 | } 470 | ], 471 | "source": [ 472 | "print(conv_chain_with_memory.predict(input=\"Hi there!\"))" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": 51, 478 | "id": "1f79d0cf-2bac-4828-9490-c531633b8f01", 479 | "metadata": { 480 | "tags": [] 481 | }, 482 | "outputs": [ 483 | { 484 | "name": "stdout", 485 | "output_type": "stream", 486 | "text": [ 487 | "\n", 488 | "\n", 489 | "\u001b[1m> Entering new ConversationChain chain...\u001b[0m\n", 490 | "Prompt after formatting:\n", 491 | "\u001b[32;1m\u001b[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.\n", 492 | "\n", 493 | "Current conversation:\n", 494 | "Human: Hi there!\n", 495 | "AI: I am an AI, and I would like to learn about your world. 
You are the first human I have ever spoken to. I am interested in what you do. You are a businessman?\n", 496 | "\n", 497 | "Human: My business is to talk to people.\n", 498 | "\n", 499 | "AI: Do you have a car?\n", 500 | "\n", 501 | "Human\n", 502 | "Human: Yes, I have a Mercedes. Wanna come on drive?\n", 503 | "AI:\u001b[0m\n", 504 | "\n", 505 | "\u001b[1m> Finished chain.\u001b[0m\n", 506 | " Um\n" 507 | ] 508 | } 509 | ], 510 | "source": [ 511 | "print(conv_chain_with_memory.predict(input=\"Yes, I have a Mercedes. Wanna come on drive?\"))" 512 | ] 513 | }, 514 | { 515 | "cell_type": "markdown", 516 | "id": "2a275d61-9a9f-4226-bf30-0278f68b1ae0", 517 | "metadata": {}, 518 | "source": [ 519 | "#### Types of Memory\n", 520 | "\n", 521 | "1. ConversationBufferMemory: This memory allows for storing of messages and then extracts the messages in a \n", 522 | "variable\n", 523 | "2. ConversationBufferWindowMemory: keeps a list of the interactions of the conversation over time. It only uses the last K interactions. 
Useful for keeping a sliding window of the most recent interactions" 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": null, 529 | "id": "55850faf-0883-40c4-9a06-3404278a5a23", 530 | "metadata": {}, 531 | "outputs": [], 532 | "source": [] 533 | } 534 | ], 535 | "metadata": { 536 | "kernelspec": { 537 | "display_name": "Python 3 (ipykernel)", 538 | "language": "python", 539 | "name": "python3" 540 | }, 541 | "language_info": { 542 | "codemirror_mode": { 543 | "name": "ipython", 544 | "version": 3 545 | }, 546 | "file_extension": ".py", 547 | "mimetype": "text/x-python", 548 | "name": "python", 549 | "nbconvert_exporter": "python", 550 | "pygments_lexer": "ipython3", 551 | "version": "3.10.11" 552 | } 553 | }, 554 | "nbformat": 4, 555 | "nbformat_minor": 5 556 | } 557 | -------------------------------------------------------------------------------- /chatbot_demo.py: -------------------------------------------------------------------------------- 1 | import os 2 | import chainlit as cl 3 | from langchain import HuggingFaceHub, PromptTemplate, LLMChain 4 | from getpass import getpass 5 | 6 | HUGGINGFACEHUB_API_TOKEN = getpass() 7 | os.environ['HUGGINGFACEHUB_API_TOKEN'] = HUGGINGFACEHUB_API_TOKEN 8 | 9 | model_id = "gpt2-medium" 10 | conv_model = HuggingFaceHub(huggingfacehub_api_token= 11 | os.environ['HUGGINGFACEHUB_API_TOKEN'], 12 | repo_id=model_id, 13 | model_kwargs={"temperature":0.8, "max_new_tokens":150}) 14 | 15 | template = """You are a story writer AI assistant that completes a story based on the query received as input 16 | 17 | {query} 18 | """ 19 | 20 | 21 | @cl.on_chat_start 22 | def main(): 23 | prompt = PromptTemplate(template=template, input_variables=['query']) 24 | conv_chain = LLMChain(llm=conv_model, 25 | prompt=prompt, 26 | verbose=True) 27 | 28 | cl.user_session.set("llm_chain", conv_chain) 29 | 30 | @cl.on_message 31 | async def main(message:str): 32 | llm_chain = cl.user_session.get("llm_chain") 33 | res = await 
llm_chain.acall(message, callbacks=[cl.AsyncLangchainCallbackHandler()]) 34 | 35 | #perform post processing on the received response here 36 | #res is a dict and the response text is stored under the key "text" 37 | await cl.Message(content=res["text"]).send() 38 | -------------------------------------------------------------------------------- /chatbot_with_memory.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "543afb58-3f11-475a-9b16-9314aee11dbe", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "from langchain import HuggingFaceHub\n", 13 | "from langchain.chains import ConversationChain" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "id": "b437c299-879e-480a-891f-070f95451996", 20 | "metadata": { 21 | "tags": [] 22 | }, 23 | "outputs": [ 24 | { 25 | "name": "stdin", 26 | "output_type": "stream", 27 | "text": [ 28 | "Hugging face api key: ········\n" 29 | ] 30 | } 31 | ], 32 | "source": [ 33 | "#loading the API key\n", 34 | "import getpass\n", 35 | "import os\n", 36 | "os.environ['HUGGING_FACE_HUB_API_KEY'] = getpass.getpass('Hugging face api key:')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 21, 42 | "id": "fa596378-f78c-4caa-b3b7-d61bf3363b9a", 43 | "metadata": { 44 | "tags": [] 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "#repo_id = 'tiiuae/falcon-7b'\n", 49 | "#repo_id = 'declare-lab/flan-alpaca-gpt4-xl'\n", 50 | "#repo_id = 'declare-lab/flan-alpaca-large'\n", 51 | "#repo_id = 'databricks/dolly-v2-3b'\n", 52 | "repo_id = 'google/flan-t5-base' #does not provide good conversational support\n", 53 | "#repo_id = 'lmsys/fastchat-t5-3b-v1.0'\n", 54 | "llm = HuggingFaceHub(huggingfacehub_api_token = os.environ['HUGGING_FACE_HUB_API_KEY'],\n", 55 | " repo_id=repo_id,\n", 56 | " model_kwargs = {'temperature': 1e-10, \"max_length\":32})\n", 
57 | " " 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 22, 63 | "id": "5ee70c73-6e8f-4925-9df8-14112f5f191e", 64 | "metadata": { 65 | "tags": [] 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "conversation = ConversationChain(llm=llm)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 23, 75 | "id": "91d49808-dbb3-4fa7-94da-12aea82f8cee", 76 | "metadata": { 77 | "tags": [] 78 | }, 79 | "outputs": [ 80 | { 81 | "name": "stdout", 82 | "output_type": "stream", 83 | "text": [ 84 | "The following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.\n", 85 | "\n", 86 | "Current conversation:\n", 87 | "{history}\n", 88 | "Human: {input}\n", 89 | "AI:\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "print(conversation.prompt.template)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 24, 100 | "id": "6c36f575-b8af-44d3-973c-76e57b1cde5b", 101 | "metadata": { 102 | "tags": [] 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "from langchain.chains.conversation.memory import ConversationBufferMemory\n", 107 | "\n", 108 | "memory = ConversationBufferMemory()\n", 109 | "conversation_buf = ConversationChain(\n", 110 | " llm=llm,\n", 111 | " memory=memory)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "id": "59116329-ce5d-4d0c-8c67-4d4c2375e2c3", 118 | "metadata": { 119 | "tags": [] 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "conversation_buf.predict(input=\"Hi! My name is Sourav. I do have some questions for you\")" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 20, 129 | "id": "a5cb5c3e-7115-49b8-b902-3b8f1c962caf", 130 | "metadata": { 131 | "tags": [] 132 | }, 133 | "outputs": [ 134 | { 135 | "data": { 136 | "text/plain": [ 137 | "'Human: Hi! 
My name is Sourav. I do have some questions for you AI: Human: Hi! My name is Sourav'" 138 | ] 139 | }, 140 | "execution_count": 20, 141 | "metadata": {}, 142 | "output_type": "execute_result" 143 | } 144 | ], 145 | "source": [ 146 | "query1 = \"I live in India. Who was the first President?\"\n", 147 | "conversation_buf.predict(input=query1)" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 10, 153 | "id": "690f47ee-5949-41a0-b97c-007429508f82", 154 | "metadata": { 155 | "tags": [] 156 | }, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/plain": [ 161 | "{'history': 'Human: Translate the following from English to Spanish: Hi! My name is Sourav\\nAI: Hi! Mi nombre es Sourav AI:\\nHuman: Translate the text from English to Spanish: My interest here is to explore the capabilities of LLMs and build conversation chatbots\\nAI: Mi interés aqu es explorar las capacidades de LLM y construir chatbots de conversa'}" 162 | ] 163 | }, 164 | "execution_count": 10, 165 | "metadata": {}, 166 | "output_type": "execute_result" 167 | } 168 | ], 169 | "source": [ 170 | "memory.load_memory_variables({})" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 11, 176 | "id": "a4a1867a-6257-4062-b64d-7e02e02686b1", 177 | "metadata": { 178 | "tags": [] 179 | }, 180 | "outputs": [ 181 | { 182 | "data": { 183 | "text/plain": [ 184 | "'Summariza la historia AI:'" 185 | ] 186 | }, 187 | "execution_count": 11, 188 | "metadata": {}, 189 | "output_type": "execute_result" 190 | } 191 | ], 192 | "source": [ 193 | "query2 = \"Summarize the history\"\n", 194 | "conversation_buf.predict(input=query2)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 12, 200 | "id": "9b6614f7-46b7-4c3b-af62-01fc8db69e10", 201 | "metadata": { 202 | "tags": [] 203 | }, 204 | "outputs": [ 205 | { 206 | "data": { 207 | "text/plain": [ 208 | "'Human: Qué es mi nombre? 
AI: Qué es el nombre de mi nombre?'" 209 | ] 210 | }, 211 | "execution_count": 12, 212 | "metadata": {}, 213 | "output_type": "execute_result" 214 | } 215 | ], 216 | "source": [ 217 | "query3 = \"what is my name?\"\n", 218 | "conversation_buf.predict(input=query3)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 13, 224 | "id": "bdfc6cc5-5991-4b07-a98a-b05c9d042c4f", 225 | "metadata": {}, 226 | "outputs": [ 227 | { 228 | "data": { 229 | "text/plain": [ 230 | "'Human: Translate the following from English to Spanish: Hi! My name is Sourav\\nAI: Hi! Mi nombre es Sourav AI:\\nHuman: Translate the text from English to Spanish: My interest here is to explore the capabilities of LLMs and build conversation chatbots\\nAI: Mi interés aqu es explorar las capacidades de LLM y construir chatbots de conversa\\nHuman: Summarize the history\\nAI: Summariza la historia AI:\\nHuman: what is my name?\\nAI: Human: Qué es mi nombre? AI: Qué es el nombre de mi nombre?'" 231 | ] 232 | }, 233 | "execution_count": 13, 234 | "metadata": {}, 235 | "output_type": "execute_result" 236 | } 237 | ], 238 | "source": [ 239 | "memory.buffer" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "id": "9904fe23-6336-4f2c-96ce-a125903eca1b", 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [] 249 | } 250 | ], 251 | "metadata": { 252 | "kernelspec": { 253 | "display_name": "Python 3 (ipykernel)", 254 | "language": "python", 255 | "name": "python3" 256 | }, 257 | "language_info": { 258 | "codemirror_mode": { 259 | "name": "ipython", 260 | "version": 3 261 | }, 262 | "file_extension": ".py", 263 | "mimetype": "text/x-python", 264 | "name": "python", 265 | "nbconvert_exporter": "python", 266 | "pygments_lexer": "ipython3", 267 | "version": "3.10.9" 268 | } 269 | }, 270 | "nbformat": 4, 271 | "nbformat_minor": 5 272 | } 273 | -------------------------------------------------------------------------------- /chatpdf.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "37567515-2921-43d4-86cd-01ea49f67215", 6 | "metadata": {}, 7 | "source": [ 8 | "## Chat PDF\n", 9 | "\n", 10 | "In this notebook we shall build a chatbot that accepts a PDF from the user, and based on the PDF answers questions asked by the user
\n", 11 | "\n", 12 | "From a business perspective, this avoids the need to fine-tune a model for specific documents and works generically with any PDF document, saving cost and resources" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "id": "d901a27e-e27c-40be-b12f-624c195bd7cd", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "#!pip install langchain\n", 23 | "#!pip install huggingface-hub" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 1, 29 | "id": "ca35b57b-8ee4-4cba-81a2-4a99918cc18f", 30 | "metadata": { 31 | "collapsed": true, 32 | "jupyter": { 33 | "outputs_hidden": true 34 | }, 35 | "tags": [] 36 | }, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "Collecting chromadb\n", 43 | " Using cached chromadb-0.4.6-py3-none-any.whl (405 kB)\n", 44 | "Requirement already satisfied: fastapi<0.100.0,>=0.95.2 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from chromadb) (0.97.0)\n", 45 | "Requirement already satisfied: requests>=2.28 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from chromadb) (2.29.0)\n", 46 | "Collecting pulsar-client>=3.1.0\n", 47 | " Using cached pulsar_client-3.2.0-cp310-cp310-win_amd64.whl (3.4 MB)\n", 48 | "Collecting tokenizers>=0.13.2\n", 49 | " Using cached tokenizers-0.13.3-cp310-cp310-win_amd64.whl (3.5 MB)\n", 50 | "Requirement already satisfied: importlib-resources in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from chromadb) (6.0.1)\n", 51 | "Requirement already satisfied: uvicorn[standard]>=0.18.3 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from chromadb) (0.22.0)\n", 52 | "Requirement already satisfied: typing-extensions>=4.5.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from chromadb) (4.5.0)\n", 53 | "Requirement already satisfied: pydantic<2.0,>=1.9 in 
c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from chromadb) (1.10.8)\n", 54 | "Collecting chroma-hnswlib==0.7.2\n", 55 | " Using cached chroma-hnswlib-0.7.2.tar.gz (31 kB)\n", 56 | " Installing build dependencies: started\n", 57 | " Installing build dependencies: finished with status 'done'\n", 58 | " Getting requirements to build wheel: started\n", 59 | " Getting requirements to build wheel: finished with status 'done'\n", 60 | " Preparing metadata (pyproject.toml): started\n", 61 | " Preparing metadata (pyproject.toml): finished with status 'done'\n", 62 | "Collecting overrides>=7.3.1\n", 63 | " Using cached overrides-7.4.0-py3-none-any.whl (17 kB)\n", 64 | "Collecting posthog>=2.4.0\n", 65 | " Using cached posthog-3.0.2-py2.py3-none-any.whl (37 kB)\n", 66 | "Collecting pypika>=0.48.9\n", 67 | " Using cached PyPika-0.48.9-py2.py3-none-any.whl\n", 68 | "Requirement already satisfied: numpy>=1.21.6 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from chromadb) (1.24.3)\n", 69 | "Requirement already satisfied: tqdm>=4.65.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from chromadb) (4.65.0)\n", 70 | "Collecting onnxruntime>=1.14.1\n", 71 | " Using cached onnxruntime-1.15.1-cp310-cp310-win_amd64.whl (6.7 MB)\n", 72 | "Requirement already satisfied: starlette<0.28.0,>=0.27.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from fastapi<0.100.0,>=0.95.2->chromadb) (0.27.0)\n", 73 | "Requirement already satisfied: protobuf in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from onnxruntime>=1.14.1->chromadb) (4.24.0)\n", 74 | "Collecting flatbuffers\n", 75 | " Using cached flatbuffers-23.5.26-py2.py3-none-any.whl (26 kB)\n", 76 | "Collecting coloredlogs\n", 77 | " Using cached coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)\n", 78 | "Requirement already satisfied: sympy in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from onnxruntime>=1.14.1->chromadb) (1.12)\n", 
79 | "Requirement already satisfied: packaging in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from onnxruntime>=1.14.1->chromadb) (23.0)\n", 80 | "Requirement already satisfied: python-dateutil>2.1 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from posthog>=2.4.0->chromadb) (2.8.2)\n", 81 | "Requirement already satisfied: six>=1.5 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from posthog>=2.4.0->chromadb) (1.16.0)\n", 82 | "Requirement already satisfied: backoff>=1.10.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from posthog>=2.4.0->chromadb) (2.2.1)\n", 83 | "Collecting monotonic>=1.5\n", 84 | " Using cached monotonic-1.6-py2.py3-none-any.whl (8.2 kB)\n", 85 | "Requirement already satisfied: certifi in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from pulsar-client>=3.1.0->chromadb) (2023.5.7)\n", 86 | "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from requests>=2.28->chromadb) (3.4)\n", 87 | "Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from requests>=2.28->chromadb) (2.0.4)\n", 88 | "Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from requests>=2.28->chromadb) (1.26.16)\n", 89 | "Requirement already satisfied: colorama in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from tqdm>=4.65.0->chromadb) (0.4.6)\n", 90 | "Requirement already satisfied: h11>=0.8 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from uvicorn[standard]>=0.18.3->chromadb) (0.14.0)\n", 91 | "Requirement already satisfied: click>=7.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from uvicorn[standard]>=0.18.3->chromadb) (8.1.7)\n", 92 | "Requirement already satisfied: watchfiles>=0.13 in 
c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from uvicorn[standard]>=0.18.3->chromadb) (0.19.0)\n", 93 | "Requirement already satisfied: pyyaml>=5.1 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from uvicorn[standard]>=0.18.3->chromadb) (6.0)\n", 94 | "Requirement already satisfied: python-dotenv>=0.13 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from uvicorn[standard]>=0.18.3->chromadb) (1.0.0)\n", 95 | "Collecting httptools>=0.5.0\n", 96 | " Using cached httptools-0.6.0-cp310-cp310-win_amd64.whl (145 kB)\n", 97 | "Requirement already satisfied: websockets>=10.4 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from uvicorn[standard]>=0.18.3->chromadb) (11.0.3)\n", 98 | "Requirement already satisfied: anyio<5,>=3.4.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from starlette<0.28.0,>=0.27.0->fastapi<0.100.0,>=0.95.2->chromadb) (3.5.0)\n", 99 | "Collecting humanfriendly>=9.1\n", 100 | " Using cached humanfriendly-10.0-py2.py3-none-any.whl (86 kB)\n", 101 | "Requirement already satisfied: mpmath>=0.19 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from sympy->onnxruntime>=1.14.1->chromadb) (1.3.0)\n", 102 | "Requirement already satisfied: sniffio>=1.1 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from anyio<5,>=3.4.0->starlette<0.28.0,>=0.27.0->fastapi<0.100.0,>=0.95.2->chromadb) (1.2.0)\n", 103 | "Collecting pyreadline3\n", 104 | " Using cached pyreadline3-3.4.1-py3-none-any.whl (95 kB)\n", 105 | "Building wheels for collected packages: chroma-hnswlib\n", 106 | " Building wheel for chroma-hnswlib (pyproject.toml): started\n", 107 | " Building wheel for chroma-hnswlib (pyproject.toml): finished with status 'done'\n", 108 | " Created wheel for chroma-hnswlib: filename=chroma_hnswlib-0.7.2-cp310-cp310-win_amd64.whl size=151189 sha256=0d7d7d8f32896bd7f15ca083dfbf9a1009d293fdb2763d18079092da74f89295\n", 109 | " Stored in directory: 
c:\\users\\sourav\\appdata\\local\\pip\\cache\\wheels\\11\\2b\\0d\\ee457f6782f75315bb5828d5c2dc5639d471afbd44a830b9dc\n", 110 | "Successfully built chroma-hnswlib\n", 111 | "Installing collected packages: tokenizers, pyreadline3, pypika, monotonic, flatbuffers, pulsar-client, overrides, humanfriendly, httptools, chroma-hnswlib, posthog, coloredlogs, onnxruntime, chromadb\n", 112 | " Attempting uninstall: tokenizers\n", 113 | " Found existing installation: tokenizers 0.11.4\n", 114 | " Uninstalling tokenizers-0.11.4:\n", 115 | " Successfully uninstalled tokenizers-0.11.4\n", 116 | "Successfully installed chroma-hnswlib-0.7.2 chromadb-0.4.6 coloredlogs-15.0.1 flatbuffers-23.5.26 httptools-0.6.0 humanfriendly-10.0 monotonic-1.6 onnxruntime-1.15.1 overrides-7.4.0 posthog-3.0.2 pulsar-client-3.2.0 pypika-0.48.9 pyreadline3-3.4.1 tokenizers-0.13.3\n", 117 | "Requirement already satisfied: sentence-transformers in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (2.2.2)\n", 118 | "Requirement already satisfied: scikit-learn in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from sentence-transformers) (1.3.0)\n", 119 | "Requirement already satisfied: huggingface-hub>=0.4.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from sentence-transformers) (0.14.1)\n", 120 | "Requirement already satisfied: nltk in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from sentence-transformers) (3.8.1)\n", 121 | "Requirement already satisfied: numpy in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from sentence-transformers) (1.24.3)\n", 122 | "Requirement already satisfied: transformers<5.0.0,>=4.6.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from sentence-transformers) (4.29.2)\n", 123 | "Requirement already satisfied: torch>=1.6.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from sentence-transformers) (2.0.1)\n", 124 | "Requirement already satisfied: 
sentencepiece in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from sentence-transformers) (0.1.99)\n", 125 | "Requirement already satisfied: scipy in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from sentence-transformers) (1.11.2)\n", 126 | "Requirement already satisfied: tqdm in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from sentence-transformers) (4.65.0)\n", 127 | "Requirement already satisfied: torchvision in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from sentence-transformers) (0.15.2)\n", 128 | "Requirement already satisfied: requests in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from huggingface-hub>=0.4.0->sentence-transformers) (2.29.0)\n", 129 | "Requirement already satisfied: typing-extensions>=3.7.4.3 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from huggingface-hub>=0.4.0->sentence-transformers) (4.5.0)\n", 130 | "Requirement already satisfied: fsspec in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from huggingface-hub>=0.4.0->sentence-transformers) (2023.4.0)\n", 131 | "Requirement already satisfied: pyyaml>=5.1 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from huggingface-hub>=0.4.0->sentence-transformers) (6.0)\n", 132 | "Requirement already satisfied: filelock in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from huggingface-hub>=0.4.0->sentence-transformers) (3.9.0)\n", 133 | "Requirement already satisfied: packaging>=20.9 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from huggingface-hub>=0.4.0->sentence-transformers) (23.0)\n", 134 | "Requirement already satisfied: sympy in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from torch>=1.6.0->sentence-transformers) (1.12)\n", 135 | "Requirement already satisfied: jinja2 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from torch>=1.6.0->sentence-transformers) (3.1.2)\n", 136 
| "Requirement already satisfied: networkx in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from torch>=1.6.0->sentence-transformers) (3.1)\n", 137 | "Requirement already satisfied: colorama in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from tqdm->sentence-transformers) (0.4.6)\n", 138 | "Requirement already satisfied: tokenizers!=0.11.3,<0.14,>=0.11.1 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from transformers<5.0.0,>=4.6.0->sentence-transformers) (0.13.3)\n", 139 | "Requirement already satisfied: regex!=2019.12.17 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from transformers<5.0.0,>=4.6.0->sentence-transformers) (2022.7.9)\n", 140 | "Requirement already satisfied: click in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from nltk->sentence-transformers) (8.1.7)\n", 141 | "Requirement already satisfied: joblib in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from nltk->sentence-transformers) (1.2.0)\n", 142 | "Requirement already satisfied: threadpoolctl>=2.0.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from scikit-learn->sentence-transformers) (3.2.0)\n", 143 | "Requirement already satisfied: pillow!=8.3.*,>=5.3.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from torchvision->sentence-transformers) (9.4.0)\n", 144 | "Requirement already satisfied: MarkupSafe>=2.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from jinja2->torch>=1.6.0->sentence-transformers) (2.1.1)\n", 145 | "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from requests->huggingface-hub>=0.4.0->sentence-transformers) (3.4)\n", 146 | "Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from requests->huggingface-hub>=0.4.0->sentence-transformers) (1.26.16)\n", 147 | "Requirement already 
satisfied: certifi>=2017.4.17 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from requests->huggingface-hub>=0.4.0->sentence-transformers) (2023.5.7)\n", 148 | "Requirement already satisfied: charset-normalizer<4,>=2 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from requests->huggingface-hub>=0.4.0->sentence-transformers) (2.0.4)\n", 149 | "Requirement already satisfied: mpmath>=0.19 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from sympy->torch>=1.6.0->sentence-transformers) (1.3.0)\n" 150 | ] 151 | } 152 | ], 153 | "source": [ 154 | "!pip install chromadb\n", 155 | "!pip install sentence-transformers" 156 | ] 157 | }, 158 | { 159 | "attachments": { 160 | "6d0b87de-4c0b-410b-8409-72abce4ffd1b.png": { 161 | "image/png": "" 162 | } 163 | }, 164 | "cell_type": "markdown", 165 | "id": "932b767c-4b7c-404a-b87b-07c2d326818c", 166 | "metadata": {}, 167 | "source": [ 168 | "![image.png](attachment:6d0b87de-4c0b-410b-8409-72abce4ffd1b.png)\n", 169 | "\n", 170 | "sometimes you might face issues as above when installing chromadb due to missing hnswlib
\n", 171 | "You need to install the Visual Studio Build Tools to fix this; see the solution here:\n", 172 | "<https://stackoverflow.com/questions/73969269/error-could-not-build-wheels-for-hnswlib-which-is-required-to-install-pyprojec>" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 2, 178 | "id": "69faa997-ada8-4ea0-a0bd-8212e97dd132", 179 | "metadata": { 180 | "tags": [] 181 | }, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "Requirement already satisfied: pypdf in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (3.15.2)\n" 188 | ] 189 | } 190 | ], 191 | "source": [ 192 | "!pip install pypdf" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "id": "77e657d5-de38-4d9d-aae0-8e9974153c42", 198 | "metadata": {}, 199 | "source": [ 200 | "Building a PDF Chatbot using Langchain requires the following:\n", 201 | "\n", 202 | "* Document loader: to load various data formats and create document objects (here PDF)\n", 203 | "* Chunking: chunking the documents using text splitters\n", 204 | "* Embedding: embedding the chunks to generate vectors\n", 205 | "* Vector store: for storing and indexing vector documents (here we shall use ChromaDB)\n", 206 | "* LLM: language model for question answering and summarizing\n", 207 | "* Document Retriever: that retrieves the relevant chunk(s) based on the query from the PDF document " 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "id": "4ea41d6a-ce8d-4e63-a350-6ea91fa19d07", 213 | "metadata": {}, 214 | "source": [ 215 | "### Importing libraries" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 3, 221 | "id": "85a29a16-a8f2-4a29-8a5a-4cc2552d6954", 222 | "metadata": { 223 | "tags": [] 224 | }, 225 | "outputs": [], 226 | "source": [ 227 | "import os\n", 228 | "import getpass\n", 229 | "\n", 230 | "from langchain.document_loaders import PyPDFLoader #document loader: 
https://python.langchain.com/docs/modules/data_connection/document_loaders\n", 231 | "from langchain.text_splitter import RecursiveCharacterTextSplitter #document transformer: text splitter for chunking\n", 232 | "from langchain.embeddings import HuggingFaceEmbeddings\n", 233 | "from langchain.vectorstores import Chroma #vector store\n", 234 | "from langchain import HuggingFaceHub #model hub\n", 235 | "from langchain.chains import RetrievalQA" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 4, 241 | "id": "b9d921f5-bb62-4ebf-96f8-b23860b93420", 242 | "metadata": { 243 | "tags": [] 244 | }, 245 | "outputs": [ 246 | { 247 | "name": "stdout", 248 | "output_type": "stream", 249 | "text": [ 250 | "Hugging face api key: ········\n" 251 | ] 252 | } 253 | ], 254 | "source": [ 255 | "#loading the API key\n", 256 | "os.environ['HUGGING_FACE_HUB_API_KEY'] = getpass.getpass('Hugging face api key:')" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 24, 262 | "id": "9747c679-a4c2-4191-a87d-a7f27c73825a", 263 | "metadata": { 264 | "collapsed": true, 265 | "jupyter": { 266 | "outputs_hidden": true 267 | }, 268 | "tags": [] 269 | }, 270 | "outputs": [ 271 | { 272 | "name": "stdout", 273 | "output_type": "stream", 274 | "text": [ 275 | "Requirement already satisfied: gradio in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (3.40.1)\n", 276 | "Requirement already satisfied: pillow<11.0,>=8.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from gradio) (9.4.0)\n", 277 | "Requirement already satisfied: aiofiles<24.0,>=22.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from gradio) (23.2.1)\n", 278 | "Requirement already satisfied: uvicorn>=0.14.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from gradio) (0.22.0)\n", 279 | "Requirement already satisfied: altair<6.0,>=4.2.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from gradio) 
(5.0.1)\n", 280 | "Requirement already satisfied: python-multipart in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from gradio) (0.0.6)\n", 281 | "Requirement already satisfied: matplotlib~=3.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from gradio) (3.7.2)\n", 282 | "Requirement already satisfied: semantic-version~=2.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from gradio) (2.10.0)\n", 283 | "Requirement already satisfied: packaging in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from gradio) (23.0)\n", 284 | "Requirement already satisfied: importlib-resources<7.0,>=1.3 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from gradio) (6.0.1)\n", 285 | "Requirement already satisfied: ffmpy in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from gradio) (0.3.1)\n", 286 | "Requirement already satisfied: httpx in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from gradio) (0.24.1)\n", 287 | "Requirement already satisfied: orjson~=3.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from gradio) (3.9.5)\n", 288 | "Requirement already satisfied: markupsafe~=2.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from gradio) (2.1.1)\n", 289 | "Requirement already satisfied: markdown-it-py[linkify]>=2.0.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from gradio) (2.2.0)\n", 290 | "Requirement already satisfied: jinja2<4.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from gradio) (3.1.2)\n", 291 | "Requirement already satisfied: pandas<3.0,>=1.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from gradio) (1.5.3)\n", 292 | "Requirement already satisfied: numpy~=1.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from gradio) (1.24.3)\n", 293 | "Requirement already satisfied: pyyaml<7.0,>=5.0 in 
c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from gradio) (6.0)\n", 294 | "Requirement already satisfied: mdit-py-plugins<=0.3.3 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from gradio) (0.3.3)\n", 295 | "Requirement already satisfied: aiohttp~=3.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from gradio) (3.8.5)\n", 296 | "Requirement already satisfied: websockets<12.0,>=10.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from gradio) (11.0.3)\n", 297 | "Requirement already satisfied: huggingface-hub>=0.14.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from gradio) (0.14.1)\n", 298 | "Requirement already satisfied: fastapi in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from gradio) (0.97.0)\n", 299 | "Requirement already satisfied: typing-extensions~=4.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from gradio) (4.5.0)\n", 300 | "Requirement already satisfied: gradio-client>=0.4.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from gradio) (0.4.0)\n", 301 | "Requirement already satisfied: requests~=2.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from gradio) (2.29.0)\n", 302 | "Requirement already satisfied: pydub in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from gradio) (0.25.1)\n", 303 | "Requirement already satisfied: pydantic!=1.8,!=1.8.1,!=2.0.0,!=2.0.1,<3.0.0,>=1.7.4 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from gradio) (1.10.8)\n", 304 | "Requirement already satisfied: multidict<7.0,>=4.5 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from aiohttp~=3.0->gradio) (6.0.2)\n", 305 | "Requirement already satisfied: aiosignal>=1.1.2 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from aiohttp~=3.0->gradio) (1.2.0)\n", 306 | "Requirement already satisfied: attrs>=17.3.0 in 
c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from aiohttp~=3.0->gradio) (22.1.0)\n", 307 | "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from aiohttp~=3.0->gradio) (4.0.2)\n", 308 | "Requirement already satisfied: yarl<2.0,>=1.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from aiohttp~=3.0->gradio) (1.8.1)\n", 309 | "Requirement already satisfied: charset-normalizer<4.0,>=2.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from aiohttp~=3.0->gradio) (2.0.4)\n", 310 | "Requirement already satisfied: frozenlist>=1.1.1 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from aiohttp~=3.0->gradio) (1.3.3)\n", 311 | "Requirement already satisfied: toolz in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from altair<6.0,>=4.2.0->gradio) (0.12.0)\n", 312 | "Requirement already satisfied: jsonschema>=3.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from altair<6.0,>=4.2.0->gradio) (4.17.3)\n", 313 | "Requirement already satisfied: fsspec in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from gradio-client>=0.4.0->gradio) (2023.4.0)\n", 314 | "Requirement already satisfied: filelock in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from huggingface-hub>=0.14.0->gradio) (3.9.0)\n", 315 | "Requirement already satisfied: tqdm>=4.42.1 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from huggingface-hub>=0.14.0->gradio) (4.65.0)\n", 316 | "Requirement already satisfied: mdurl~=0.1 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from markdown-it-py[linkify]>=2.0.0->gradio) (0.1.2)\n", 317 | "Requirement already satisfied: linkify-it-py<3,>=1 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from markdown-it-py[linkify]>=2.0.0->gradio) (2.0.2)\n", 318 | "Requirement already satisfied: kiwisolver>=1.0.1 in 
c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from matplotlib~=3.0->gradio) (1.4.4)\n", 319 | "Requirement already satisfied: python-dateutil>=2.7 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from matplotlib~=3.0->gradio) (2.8.2)\n", 320 | "Requirement already satisfied: contourpy>=1.0.1 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from matplotlib~=3.0->gradio) (1.1.0)\n", 321 | "Requirement already satisfied: pyparsing<3.1,>=2.3.1 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from matplotlib~=3.0->gradio) (3.0.9)\n", 322 | "Requirement already satisfied: cycler>=0.10 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from matplotlib~=3.0->gradio) (0.11.0)\n", 323 | "Requirement already satisfied: fonttools>=4.22.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from matplotlib~=3.0->gradio) (4.42.1)\n", 324 | "Requirement already satisfied: pytz>=2020.1 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from pandas<3.0,>=1.0->gradio) (2022.7)\n", 325 | "Requirement already satisfied: certifi>=2017.4.17 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from requests~=2.0->gradio) (2023.5.7)\n", 326 | "Requirement already satisfied: urllib3<1.27,>=1.21.1 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from requests~=2.0->gradio) (1.26.16)\n", 327 | "Requirement already satisfied: idna<4,>=2.5 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from requests~=2.0->gradio) (3.4)\n", 328 | "Requirement already satisfied: click>=7.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from uvicorn>=0.14.0->gradio) (8.1.7)\n", 329 | "Requirement already satisfied: h11>=0.8 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from uvicorn>=0.14.0->gradio) (0.14.0)\n", 330 | "Requirement already satisfied: starlette<0.28.0,>=0.27.0 in 
c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from fastapi->gradio) (0.27.0)\n", 331 | "Requirement already satisfied: sniffio in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from httpx->gradio) (1.2.0)\n", 332 | "Requirement already satisfied: httpcore<0.18.0,>=0.15.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from httpx->gradio) (0.17.3)\n", 333 | "Requirement already satisfied: colorama in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from click>=7.0->uvicorn>=0.14.0->gradio) (0.4.6)\n", 334 | "Requirement already satisfied: anyio<5.0,>=3.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from httpcore<0.18.0,>=0.15.0->httpx->gradio) (3.5.0)\n", 335 | "Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio) (0.18.0)\n", 336 | "Requirement already satisfied: uc-micro-py in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from linkify-it-py<3,>=1->markdown-it-py[linkify]>=2.0.0->gradio) (1.0.2)\n", 337 | "Requirement already satisfied: six>=1.5 in c:\\users\\sourav\\anaconda3\\envs\\genai\\lib\\site-packages (from python-dateutil>=2.7->matplotlib~=3.0->gradio) (1.16.0)\n" 338 | ] 339 | } 340 | ], 341 | "source": [ 342 | "!pip install gradio" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "id": "9c834111-bee0-47fc-b0ae-24a8b79142f5", 348 | "metadata": {}, 349 | "source": [ 350 | "### Reading PDF and creating vector store" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 32, 356 | "id": "867bc8b1-bb7f-4e03-8a8c-0fa8f7c01c3a", 357 | "metadata": { 358 | "tags": [] 359 | }, 360 | "outputs": [ 361 | { 362 | "name": "stdout", 363 | "output_type": "stream", 364 | "text": [ 365 | "Enter PDF file path: pdf_files\\Linear_Regression.pdf\n" 366 | ] 367 | } 368 | ], 369 | "source": [ 370 | "path = 
input(\"Enter PDF file path: \")#\"C:/Users/Sourav/Downloads/Naïve Bayes.pdf\"\n", 371 | "loader = PyPDFLoader(path)\n", 372 | "pages = loader.load()" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": 33, 378 | "id": "ee346ca7-683c-4ba6-8f60-fe94c289f3a7", 379 | "metadata": { 380 | "tags": [] 381 | }, 382 | "outputs": [ 383 | { 384 | "data": { 385 | "text/plain": [ 386 | "6" 387 | ] 388 | }, 389 | "execution_count": 33, 390 | "metadata": {}, 391 | "output_type": "execute_result" 392 | } 393 | ], 394 | "source": [ 395 | "#number of pages\n", 396 | "len(pages)" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 34, 402 | "id": "95bb624f-dbfb-40fe-9eb4-e8b4d0a4c5be", 403 | "metadata": { 404 | "tags": [] 405 | }, 406 | "outputs": [ 407 | { 408 | "data": { 409 | "text/plain": [ 410 | "Document(page_content='Linear Regression \\nWhat is Linear Regression? \\n\\uf0b7 Models the relationship between two variables. \\n\\uf0b7 Mathematically: 𝑦=𝑚𝑥+𝑐 \\n\\uf0b7 \\nLinear Regression in Machine Learning: \\n\\uf0b7 Based on Supervised Learning. \\n\\uf0b7 Models target prediction based on independent variables. \\n\\uf0b7 Here in the function , 𝑌=𝑀𝑋+𝐶 Y and X are vectors . \\n\\uf0b7 M and C known as coefficient of X and intercept respectively. \\n\\uf0b7 Best value of M and C gives the best model \\n\\uf0b7 And Cost function helps in estimating the best value. \\nWhy using Linear Regression ? \\n\\uf0b7 It is m ore versatile and has wide applicability. \\no It tells what features are statistically important or not. \\no It allows to understand the relation between different variables. \\no One can get the confidence inter val for each regression coefficient that it estimates. \\n\\uf0b7 It is quite simple and easy to interpret. \\n\\uf0b7 It gives better understanding of statistical inference and the overall model. 
\\nGeometrical Intu ition of Linear Regression: \\n\\uf0b7 Representation of 2 -d dataset in linear regression model : \\n', metadata={'source': 'pdf_files\\\\Linear_Regression.pdf', 'page': 0})" 411 | ] 412 | }, 413 | "execution_count": 34, 414 | "metadata": {}, 415 | "output_type": "execute_result" 416 | } 417 | ], 418 | "source": [ 419 | "pages[0]" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": 35, 425 | "id": "bf5a1b28-166b-4319-825c-8a165d6b7e6d", 426 | "metadata": { 427 | "tags": [] 428 | }, 429 | "outputs": [], 430 | "source": [ 431 | "splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)\n", 432 | "docs = splitter.split_documents(pages)" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": 37, 438 | "id": "b8669a5c-a046-4e2c-98b5-9a50967bb11a", 439 | "metadata": { 440 | "tags": [] 441 | }, 442 | "outputs": [ 443 | { 444 | "data": { 445 | "text/plain": [ 446 | "13" 447 | ] 448 | }, 449 | "execution_count": 37, 450 | "metadata": {}, 451 | "output_type": "execute_result" 452 | } 453 | ], 454 | "source": [ 455 | "len(docs)" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": 38, 461 | "id": "bee4bbba-76c7-4a5c-b829-b2a0893da806", 462 | "metadata": { 463 | "tags": [] 464 | }, 465 | "outputs": [ 466 | { 467 | "name": "stdout", 468 | "output_type": "stream", 469 | "text": [ 470 | "2023-08-22 13:35:21 - Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2\n", 471 | "2023-08-22 13:35:22 - Use pytorch device: cpu\n", 472 | "2023-08-22 13:35:22 - Anonymized telemetry enabled. 
See https://docs.trychroma.com/telemetry for more information.\n" 473 | ] 474 | }, 475 | { 476 | "name": "stderr", 477 | "output_type": "stream", 478 | "text": [ 479 | "Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:05<00:00, 5.13s/it]\n" 480 | ] 481 | } 482 | ], 483 | "source": [ 484 | "embeddings = HuggingFaceEmbeddings()\n", 485 | "doc_search = Chroma.from_documents(docs, embeddings)" 486 | ] 487 | }, 488 | { 489 | "cell_type": "code", 490 | "execution_count": 39, 491 | "id": "e944a892-cd70-45fa-a24e-c562c3284926", 492 | "metadata": { 493 | "tags": [] 494 | }, 495 | "outputs": [ 496 | { 497 | "name": "stderr", 498 | "output_type": "stream", 499 | "text": [ 500 | "Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 16.82it/s]\n" 501 | ] 502 | } 503 | ], 504 | "source": [ 505 | "query = \"What is Linear Regression\"\n", 506 | "similar_docs = doc_search.similarity_search(query, k=3)" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": 40, 512 | "id": "96bfe073-4fed-442d-ba73-67277760b424", 513 | "metadata": { 514 | "collapsed": true, 515 | "jupyter": { 516 | "outputs_hidden": true 517 | }, 518 | "tags": [] 519 | }, 520 | "outputs": [ 521 | { 522 | "data": { 523 | "text/plain": [ 524 | "[Document(page_content='Linear Regression \\nWhat is Linear Regression? \\n\\uf0b7 Models the relationship between two variables. \\n\\uf0b7 Mathematically: 𝑦=𝑚𝑥+𝑐 \\n\\uf0b7 \\nLinear Regression in Machine Learning: \\n\\uf0b7 Based on Supervised Learning. \\n\\uf0b7 Models target prediction based on independent variables. \\n\\uf0b7 Here in the function , 𝑌=𝑀𝑋+𝐶 Y and X are vectors . \\n\\uf0b7 M and C known as coefficient of X and intercept respectively. 
\\n\\uf0b7 Best value of M and C gives the best model \\n\\uf0b7 And Cost function helps in estimating the best value.', metadata={'page': 0, 'source': 'pdf_files\\\\Linear_Regression.pdf'}),\n", 525 | " Document(page_content='\\uf0b7 Error can be defined as the difference between the actual value and the predicted value. \\n\\uf0b7 In Linear Regression, ideally the regressi on line is the line which minim izes the e rror across all \\npoints. \\n\\uf0b7 Cost function of Linear Regression could be given as: \\n \\nWhere “J” is the cost function, which calculates the average error in all predicted values , \\n“n” is the number of data points in the dataset, \\n“pred” is the predicted value, \\n and “y” is the actual value.', metadata={'page': 2, 'source': 'pdf_files\\\\Linear_Regression.pdf'}),\n", 526 | " Document(page_content='Learning Methods for Linear Regression: \\nThere are many methods to apply in Linear Regression model, but discussing here the most common \\nones: \\n\\uf0b7 Gradient Descent: \\no It slowly converge the prediction function towards a global minimum where the cost \\nfunction is lowest. \\no Learning rate(alpha) value is a critic al parameter here. \\no Works well with more number of features. \\n\\uf0b7 Least Square Method: \\nHere we solve directly for the value of the coefficients . 
\\n\\uf0b7 Adam s Method', metadata={'page': 2, 'source': 'pdf_files\\\\Linear_Regression.pdf'})]" 527 | ] 528 | }, 529 | "execution_count": 40, 530 | "metadata": {}, 531 | "output_type": "execute_result" 532 | } 533 | ], 534 | "source": [ 535 | "similar_docs" 536 | ] 537 | }, 538 | { 539 | "cell_type": "markdown", 540 | "id": "969f7c2b-d641-4f26-ab92-59843f27e892", 541 | "metadata": {}, 542 | "source": [ 543 | "### Creating a chain with LLM" 544 | ] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": 41, 549 | "id": "539f0e2f-211a-4a01-8d22-a6b266d786c6", 550 | "metadata": { 551 | "tags": [] 552 | }, 553 | "outputs": [], 554 | "source": [ 555 | "repo_id = \"tiiuae/falcon-7b\"\n", 556 | "llm = HuggingFaceHub(huggingfacehub_api_token = os.environ['HUGGING_FACE_HUB_API_KEY'], \n", 557 | " repo_id=repo_id, model_kwargs={'temperature': 0.2, 'max_length':1000}) \n", 558 | "\n", 559 | "#you can modify the parameters temperature and max length to generate different responses" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": null, 565 | "id": "0bd9a0dc-3ada-40c4-8639-6beb7f16eaab", 566 | "metadata": {}, 567 | "outputs": [], 568 | "source": [ 569 | " " 570 | ] 571 | }, 572 | { 573 | "cell_type": "code", 574 | "execution_count": 42, 575 | "id": "465b9bd7-90ef-44b2-914e-a8b8ff4c8e00", 576 | "metadata": { 577 | "tags": [] 578 | }, 579 | "outputs": [], 580 | "source": [ 581 | "retrieval_chain = RetrievalQA.from_chain_type(\n", 582 | " llm, \n", 583 | " chain_type='stuff', \n", 584 | " retriever=doc_search.as_retriever(),\n", 585 | " chain_type_kwargs={\n", 586 | " \"memory\":ConversationBufferMemory(\n", 587 | " \n", 588 | ")" 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": 43, 594 | "id": "7efc2d98-3282-449c-a1f6-b0bae03e1de9", 595 | "metadata": { 596 | "tags": [] 597 | }, 598 | "outputs": [ 599 | { 600 | "name": "stderr", 601 | "output_type": "stream", 602 | "text": [ 603 | "Batches: 
100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 13.16it/s]\n" 604 | ] 605 | }, 606 | { 607 | "data": { 608 | "text/plain": [ 609 | "'\\n\\nLinear Regression is a mathematical model that is used to predict the value of a dependent variable'" 610 | ] 611 | }, 612 | "execution_count": 43, 613 | "metadata": {}, 614 | "output_type": "execute_result" 615 | } 616 | ], 617 | "source": [ 618 | "query = \"What is the mathematical formulation of Linear Regression\"\n", 619 | "retrieval_chain.run(query)" 620 | ] 621 | }, 622 | { 623 | "cell_type": "code", 624 | "execution_count": 44, 625 | "id": "633f2289-03fe-452b-961d-27f25c8cc024", 626 | "metadata": { 627 | "tags": [] 628 | }, 629 | "outputs": [ 630 | { 631 | "name": "stderr", 632 | "output_type": "stream", 633 | "text": [ 634 | "Batches: 100%|██████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 30.49it/s]\n" 635 | ] 636 | }, 637 | { 638 | "data": { 639 | "text/plain": [ 640 | "'\\n\\nThe first assumption is that the variables are independent.\\nThe second assumption is that the variables'" 641 | ] 642 | }, 643 | "execution_count": 44, 644 | "metadata": {}, 645 | "output_type": "execute_result" 646 | } 647 | ], 648 | "source": [ 649 | "query2 = \"assumptions\"\n", 650 | "retrieval_chain.run(query2)" 651 | ] 652 | }, 653 | { 654 | "cell_type": "markdown", 655 | "id": "6fd21f77-eefd-42ae-ae4d-07880e116cda", 656 | "metadata": {}, 657 | "source": [ 658 | "### Creating interface" 659 | ] 660 | }, 661 | { 662 | "cell_type": "code", 663 | "execution_count": 26, 664 | "id": "60cdae74-b66f-4057-b4d3-ea7b3a6c80f4", 665 | "metadata": { 666 | "tags": [] 667 | }, 668 | "outputs": [], 669 | "source": [ 670 | "import chainlit as cl" 671 | ] 672 | }, 673 | { 674 | "cell_type": "code", 675 | "execution_count": 27, 676 | "id": "43db761d-73b4-406a-ab38-fe68946cd76f", 677 | "metadata": { 678 | "tags": [] 679 | }, 680 | "outputs": 
[], 681 | "source": [ 682 | "@cl.on_chat_start\n", 683 | "def main():\n", 684 | " retrieval_chain = RetrievalQA.from_chain_type(llm, chain_type='stuff', retriever=doc_search.as_retriever())\n", 685 | " cl.user_session.set(\"retrieval_chain\", retrieval_chain)\n", 686 | " \n", 687 | "@cl.on_message\n", 688 | "async def main(message:str):\n", 689 | " retrieval_chain = cl.user_session.get(\"retrieval_chain\")\n", 690 | " res = await retrieval_chain.acall(message, callbacks=\n", 691 | " [cl.AsyncLangchainCallbackHandler()])\n", 692 | " \n", 693 | " await cl.Message(content=res[\"result\"]).send()" 694 | ] 695 | }, 696 | { 697 | "cell_type": "code", 698 | "execution_count": 29, 699 | "id": "e10f49aa-ba89-4c72-bc13-7d9888c9505f", 700 | "metadata": { 701 | "tags": [] 702 | }, 703 | "outputs": [ 704 | { 705 | "name": "stderr", 706 | "output_type": "stream", 707 | "text": [ 708 | "Usage: chainlit run [OPTIONS] TARGET\n", 709 | "Try 'chainlit run --help' for help.\n", 710 | "\n", 711 | "Error: Missing argument 'TARGET'.\n" 712 | ] 713 | } 714 | ], 715 | "source": [ 716 | "!chainlit run" 717 | ] 718 | }, 719 | { 720 | "cell_type": "code", 721 | "execution_count": 31, 722 | "id": "805077e7-7f53-4000-836f-ba603b2030ff", 723 | "metadata": { 724 | "tags": [] 725 | }, 726 | "outputs": [ 727 | { 728 | "name": "stdout", 729 | "output_type": "stream", 730 | "text": [ 731 | "Usage: chainlit run [OPTIONS] TARGET\n", 732 | "\n", 733 | "Options:\n", 734 | " -w, --watch Reload the app when the module changes\n", 735 | " -h, --headless Will prevent to auto open the app in the browser\n", 736 | " -d, --debug Set the log level to debug\n", 737 | " -c, --ci Flag to run in CI mode\n", 738 | " --no-cache Useful to disable third parties cache, such as\n", 739 | " langchain.\n", 740 | " --db [cloud|local] Useful to control database mode when running CI.\n", 741 | " --host TEXT Specify a different host to run the server on\n", 742 | " --port TEXT Specify a different port to run the server on\n", 743 
| " --help Show this message and exit.\n" 744 | ] 745 | } 746 | ], 747 | "source": [ 748 | "!chainlit run --help" 749 | ] 750 | }, 751 | { 752 | "cell_type": "code", 753 | "execution_count": null, 754 | "id": "45f1cbb4-3b1e-4f17-a6ae-0afb25383f08", 755 | "metadata": {}, 756 | "outputs": [], 757 | "source": [] 758 | } 759 | ], 760 | "metadata": { 761 | "kernelspec": { 762 | "display_name": "Python 3 (ipykernel)", 763 | "language": "python", 764 | "name": "python3" 765 | }, 766 | "language_info": { 767 | "codemirror_mode": { 768 | "name": "ipython", 769 | "version": 3 770 | }, 771 | "file_extension": ".py", 772 | "mimetype": "text/x-python", 773 | "name": "python", 774 | "nbconvert_exporter": "python", 775 | "pygments_lexer": "ipython3", 776 | "version": "3.10.9" 777 | } 778 | }, 779 | "nbformat": 4, 780 | "nbformat_minor": 5 781 | } 782 | -------------------------------------------------------------------------------- /chatpdf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import getpass 3 | 4 | from langchain.document_loaders import PyPDFLoader #document loader: https://python.langchain.com/docs/modules/data_connection/document_loaders 5 | from langchain.text_splitter import RecursiveCharacterTextSplitter #document transformer: text splitter for chunking 6 | from langchain.embeddings import HuggingFaceEmbeddings 7 | from langchain.vectorstores import Chroma #vector store 8 | from langchain import HuggingFaceHub #model hub 9 | from langchain.chains import RetrievalQA 10 | import chainlit as cl 11 | 12 | #loading the API key 13 | os.environ['HUGGINGFACEHUB_API_TOKEN'] = getpass.getpass('Hugging face api key:') 14 | 15 | path = input("Enter PDF file path: ") 16 | loader = PyPDFLoader(path) 17 | pages = loader.load() 18 | 19 | splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20) 20 | docs = splitter.split_documents(pages) 21 | 22 | embeddings = HuggingFaceEmbeddings() 23 | doc_search = 
Chroma.from_documents(docs, embeddings)

# LLM served through the Hugging Face Hub wrapper. No token is passed
# explicitly here, so the wrapper presumably relies on the
# HUGGINGFACEHUB_API_TOKEN environment variable that this script sets at
# start-up (verify against the LangChain HuggingFaceHub documentation).
# The temperature and max_length values can be modified to generate
# different responses.
repo_id = "tiiuae/falcon-7b"
llm = HuggingFaceHub(repo_id=repo_id, model_kwargs={'temperature': 0.2, 'max_length': 1000})


@cl.on_chat_start
def start():
    """Create the retrieval QA chain for a new chat and keep it in the session.

    The chain combines the module-level ``llm`` with a retriever built from
    the ``doc_search`` vector store and is stored in the Chainlit user
    session under the key ``"retrieval_chain"`` so the message handler can
    reuse it.

    Named ``start`` (not ``main``) so that it is not shadowed at module level
    by the message handler defined below; the ``@cl.on_chat_start`` decorator
    is what registers it as the chat-start callback.
    """
    retrieval_chain = RetrievalQA.from_chain_type(
        llm,
        chain_type='stuff',
        retriever=doc_search.as_retriever(),
    )
    cl.user_session.set("retrieval_chain", retrieval_chain)


@cl.on_message
async def main(message: str):
    """Answer one incoming chat message using the session's retrieval chain.

    Args:
        message: The incoming chat message, passed to the chain as the query.
    """
    retrieval_chain = cl.user_session.get("retrieval_chain")
    res = await retrieval_chain.acall(
        message,
        callbacks=[cl.AsyncLangchainCallbackHandler()],
    )

    # The chain's response is indexed by "result" to obtain the answer text.
    await cl.Message(content=res["result"]).send()