├── .gitignore
├── LICENSE
├── PersonalMemoryBot.py
├── README.md
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Avra

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/PersonalMemoryBot.py:
--------------------------------------------------------------------------------
# Import necessary modules
import re
from io import BytesIO
from typing import List, Union

import streamlit as st
from langchain import LLMChain
from langchain.agents import AgentExecutor, Tool, ZeroShotAgent
from langchain.chains import RetrievalQA
from langchain.docstore.document import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.memory import ConversationBufferMemory
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores.faiss import FAISS
from pypdf import PdfReader


# Define a function to parse a PDF file and extract its text content
@st.cache_data
def parse_pdf(file: BytesIO) -> List[str]:
    pdf = PdfReader(file)
    output = []
    for page in pdf.pages:
        text = page.extract_text()
        # Merge hyphenated words
        text = re.sub(r"(\w+)-\n(\w+)", r"\1\2", text)
        # Fix newlines in the middle of sentences
        text = re.sub(r"(?<!\n\s)\n(?!\s\n)", " ", text.strip())
        # Remove multiple newlines
        text = re.sub(r"\n\s*\n", "\n\n", text)
        output.append(text)
    return output


# Define a function to convert text content to a list of documents
@st.cache_data
def text_to_docs(text: Union[str, List[str]]) -> List[Document]:
    """Converts a string or list of strings to a list of Documents
    with metadata."""
    if isinstance(text, str):
        # Take a single string as one page
        text = [text]
    page_docs = [Document(page_content=page) for page in text]

    # Add page numbers as metadata
    for i, doc in enumerate(page_docs):
        doc.metadata["page"] = i + 1

    # Split pages into chunks
    doc_chunks = []

    for doc in page_docs:
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=2000,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
            chunk_overlap=0,
        )
        chunks = text_splitter.split_text(doc.page_content)
        for i, chunk in enumerate(chunks):
            doc = Document(
                page_content=chunk, metadata={"page": doc.metadata["page"], "chunk": i}
            )
            # Add sources as metadata
            doc.metadata["source"] = f"{doc.metadata['page']}-{doc.metadata['chunk']}"
            doc_chunks.append(doc)
    return doc_chunks
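

# Note on test_embed() below: it reads the module-level names `api` (the OpenAI
# key entered in the UI) and `pages` (the chunked documents), both of which are
# assigned further down in this Streamlit script, so it should only be called
# after a file has been uploaded and a key has been provided. The FAISS index it
# returns is cached with st.cache_data; if your Streamlit version cannot pickle
# the index, st.cache_resource may be the safer decorator here.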
# Define a function for the embeddings
@st.cache_data
def test_embed():
    embeddings = OpenAIEmbeddings(openai_api_key=api)
    # Indexing
    # Save in a Vector DB
    with st.spinner("It's indexing..."):
        index = FAISS.from_documents(pages, embeddings)
    st.success("Embeddings done.", icon="✅")
    return index


# Set up the Streamlit app
st.title("🤖 Personalized Bot with Memory 🧠")
st.markdown(
    """
#### 🗨️ Chat with your PDF files 📜 with `Conversational Buffer Memory`
> *powered by [LangChain](https://langchain.readthedocs.io/en/latest/modules/memory.html#memory) +
[OpenAI](https://platform.openai.com/docs/models/gpt-3-5) + [DataButton](https://www.databutton.io/)*
----
"""
)

st.markdown(
    """
`openai`
`langchain`
`tiktoken`
`pypdf`
`faiss-cpu`

---------
"""
)

# Set up the sidebar
st.sidebar.markdown(
    """
### Steps:
1. Upload PDF File
2. Enter Your Secret Key for Embeddings
3. Perform Q&A

**Note: File content and API key are not stored in any form.**
"""
)

# Allow the user to upload a PDF file
uploaded_file = st.file_uploader("**Upload Your PDF File**", type=["pdf"])

if uploaded_file:
    name_of_file = uploaded_file.name
    doc = parse_pdf(uploaded_file)
    pages = text_to_docs(doc)
    if pages:
        # Allow the user to select a page and view its content
        with st.expander("Show Page Content", expanded=False):
            page_sel = st.number_input(
                label="Select Page", min_value=1, max_value=len(pages), step=1
            )
            st.write(pages[page_sel - 1])
        # Allow the user to enter an OpenAI API key
        api = st.text_input(
            "**Enter OpenAI API Key**",
            type="password",
            placeholder="sk-",
            help="https://platform.openai.com/account/api-keys",
        )
        if api:
            # Test the embeddings and save the index in a vector database
            index = test_embed()
            # Set up the question-answering system
            qa = RetrievalQA.from_chain_type(
                llm=OpenAI(openai_api_key=api),
                chain_type="map_reduce",
                retriever=index.as_retriever(),
            )
            # Set up the conversational agent
            tools = [
                Tool(
                    name="State of Union QA System",
                    func=qa.run,
                    description="Useful for when you need to answer questions about the aspects asked. Input may be a partial or fully formed question.",
                )
            ]
            prefix = """Have a conversation with a human, answering the following questions as best you can based on the context and memory available.
You have access to a single tool:"""
            suffix = """Begin!

{chat_history}
Question: {input}
{agent_scratchpad}"""

            prompt = ZeroShotAgent.create_prompt(
                tools,
                prefix=prefix,
                suffix=suffix,
                input_variables=["input", "chat_history", "agent_scratchpad"],
            )
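            # The prompt exposes three placeholders: {chat_history} is filled in
            # by the ConversationBufferMemory below, {input} is the user's
            # question, and {agent_scratchpad} holds the agent's intermediate
            # tool calls and observations.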

            if "memory" not in st.session_state:
                st.session_state.memory = ConversationBufferMemory(
                    memory_key="chat_history"
                )
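            # Note: model_name="gpt-3.5-turbo" is a chat model being passed to the
            # completion-style OpenAI wrapper; depending on the installed langchain
            # version this may emit a warning suggesting ChatOpenAI instead.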
            llm_chain = LLMChain(
                llm=OpenAI(
                    temperature=0, openai_api_key=api, model_name="gpt-3.5-turbo"
                ),
                prompt=prompt,
            )
            agent = ZeroShotAgent(llm_chain=llm_chain, tools=tools, verbose=True)
            agent_chain = AgentExecutor.from_agent_and_tools(
                agent=agent, tools=tools, verbose=True, memory=st.session_state.memory
            )

            # Allow the user to enter a query and generate a response
            query = st.text_input(
                "**What's on your mind?**",
                placeholder="Ask me anything from {}".format(name_of_file),
            )

            if query:
                with st.spinner(
                    "Generating Answer to your Query : `{}` ".format(query)
                ):
                    res = agent_chain.run(query)
                    st.info(res, icon="🤖")

            # Allow the user to view the conversation history and other information stored in the agent's memory
            with st.expander("History/Memory"):
                st.write(st.session_state.memory)

# Add a video and a link to a blog post in the sidebar
with st.sidebar:
    st.video("https://youtu.be/daMNGGPJkEE")
    st.markdown("*The code, along with a blog post, will be available soon.*")

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# PersonalMemoryBot
Memory 🧠 for your Personal ChatBot 🤖 | LangChainAI and Databutton

### Note: You can start working on this app directly from [Databutton's](https://databutton.com/login?utm_source=github&utm_medium=avra&utm_article=personalmemorybot) App template, customize it, and deploy :rocket: in no time!

---

# Links

Demo App - [here](https://next.databutton.com/v/lgzxq112/Personalised_Memory_Bot)

Video - [here](https://youtu.be/daMNGGPJkEE)

Blog - [here](https://medium.com/@avra42/how-to-build-a-personalized-pdf-chat-bot-with-conversational-memory-965280c160f8)


# Similar

🧠 Memory-Bot 🤖 - A chatbot 🤖 which remembers 🧠 using 🦜 LangChain 🔗 OpenAI | Streamlit | DataButton

Repo - [MemoryBot](https://github.com/avrabyt/MemoryBot)

The live demo app is hosted [here](https://next.databutton.com/v/lgzxq112/Memory_Bot)

Blog - [here](https://medium.com/@avra42/how-to-build-a-chatbot-with-chatgpt-api-and-a-conversational-memory-in-python-8d856cda4542)

Video - [here](https://youtu.be/cHjlperESbg)


# Resources

- LangChain docs - https://langchain.readthedocs.io/en/latest/index.html
- Knowledge GPT: https://github.com/mmz-001/knowledge_gpt
- LangChain Prompt Memory module: https://langchain.readthedocs.io/en/latest/modules/memory.html#memory
- LangChain Repo: https://github.com/hwchase17/langchain
- Databutton: https://www.databutton.io/
- Databutton docs: https://docs.databutton.com/
- Streamlit docs: https://docs.streamlit.io/

--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
langchain
openai
tiktoken
faiss-cpu
pypdf
streamlit
--------------------------------------------------------------------------------