├── .gitignore ├── README.md ├── app.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | gpt4.pdf 2 | .env 3 | PDF_Chat.ipynb 4 | .ipynb_checkpoints 5 | *.pkl 6 | .DS_Store 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PDFChat 2 | 3 | The PDFChat app allows you to chat with your PDF files using the power of langchain, OpenAI Embeddings, and GPT3.5 in the backend. 4 | It uses Streamlit for the user interface. 5 | 6 | ## Demo 7 | 8 | https://user-images.githubusercontent.com/19832025/230705607-00e830c1-0181-49b6-ba92-4c9294b6cec3.mp4 9 | 10 | ## Installation 11 | 12 | To install and run the application, follow the instructions below: 13 | 14 | 1. Clone the repository using Git: 15 | 16 | ```bash 17 | git clone https://github.com/dotvignesh/PDFChat.git 18 | ``` 19 | 20 | 2. Change into the repository directory: 21 | 22 | ```bash 23 | cd PDFChat 24 | ``` 25 | 26 | 3. Create a conda environment: 27 | 28 | ```bash 29 | conda create --name pdfchat 30 | ``` 31 | 32 | 4. Activate the new conda environment: 33 | 34 | ```bash 35 | conda activate pdfchat 36 | ``` 37 | 38 | 5. Install the required packages: 39 | 40 | ```bash 41 | pip install -r requirements.txt 42 | ``` 43 | 44 | 6. Get your OpenAI API Key by following these steps: 45 | - Go to [OpenAI Website](https://platform.openai.com/account/api-keys) 46 | - Create an account or log in 47 | - Navigate to the "API Keys" section 48 | - Click on the "Create new secret key" button (or use an existing one) 49 | - Copy the API key 50 | 51 | 7. Create a `.env` file in the root of the repository directory, and add the following line, replacing `<your-api-key>` with your actual API key: 52 | 53 | ```bash 54 | OPENAI_API_KEY=<your-api-key> 55 | ``` 56 | 57 | 8. 
Run the application using streamlit: 58 | 59 | ```bash 60 | streamlit run app.py 61 | ``` 62 | 63 | The application should now be running at http://localhost:8501. 64 | 65 | 66 | ## Usage 67 | 68 | Once the app is running, you can upload your PDF files and start chatting with them using the built-in chat interface. 69 | 70 | Enjoy chatting with your PDF files! 71 | 72 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | from PyPDF2 import PdfReader 2 | from langchain.embeddings.openai import OpenAIEmbeddings 3 | from langchain.text_splitter import RecursiveCharacterTextSplitter 4 | from langchain.vectorstores import FAISS 5 | from langchain.chains.question_answering import load_qa_chain 6 | from langchain.chat_models import ChatOpenAI 7 | from langchain.chains import ConversationalRetrievalChain 8 | import pickle 9 | from pathlib import Path 10 | from dotenv import load_dotenv 11 | import os 12 | import streamlit as st 13 | from streamlit_chat import message 14 | import io 15 | import asyncio 16 | 17 | load_dotenv() 18 | api_key = os.getenv('OPENAI_API_KEY') 19 | 20 | # vectors = getDocEmbeds("gpt4.pdf") 21 | # qa = ChatVectorDBChain.from_llm(ChatOpenAI(model_name="gpt-3.5-turbo"), vectors, return_source_documents=True) 22 | 23 | async def main(): 24 | 25 | async def storeDocEmbeds(file, filename): 26 | 27 | reader = PdfReader(file) 28 | corpus = ''.join([p.extract_text() for p in reader.pages if p.extract_text()]) 29 | 30 | splitter = RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200,) 31 | chunks = splitter.split_text(corpus) 32 | 33 | embeddings = OpenAIEmbeddings(openai_api_key = api_key) 34 | vectors = FAISS.from_texts(chunks, embeddings) 35 | 36 | with open(filename + ".pkl", "wb") as f: 37 | pickle.dump(vectors, f) 38 | 39 | 40 | async def getDocEmbeds(file, filename): 41 | 42 | if not os.path.isfile(filename + ".pkl"): 
43 | await storeDocEmbeds(file, filename) 44 | 45 | with open(filename + ".pkl", "rb") as f: 46 | global vectores 47 | vectors = pickle.load(f) 48 | 49 | return vectors 50 | 51 | 52 | async def conversational_chat(query): 53 | result = qa({"question": query, "chat_history": st.session_state['history']}) 54 | st.session_state['history'].append((query, result["answer"])) 55 | # print("Log: ") 56 | # print(st.session_state['history']) 57 | return result["answer"] 58 | 59 | 60 | llm = ChatOpenAI(model_name="gpt-3.5-turbo") 61 | chain = load_qa_chain(llm, chain_type="stuff") 62 | 63 | if 'history' not in st.session_state: 64 | st.session_state['history'] = [] 65 | 66 | 67 | #Creating the chatbot interface 68 | st.title("PDFChat :") 69 | 70 | if 'ready' not in st.session_state: 71 | st.session_state['ready'] = False 72 | 73 | uploaded_file = st.file_uploader("Choose a file", type="pdf") 74 | 75 | if uploaded_file is not None: 76 | 77 | with st.spinner("Processing..."): 78 | # Add your code here that needs to be executed 79 | uploaded_file.seek(0) 80 | file = uploaded_file.read() 81 | # pdf = PyPDF2.PdfFileReader() 82 | vectors = await getDocEmbeds(io.BytesIO(file), uploaded_file.name) 83 | qa = ConversationalRetrievalChain.from_llm(ChatOpenAI(model_name="gpt-3.5-turbo"), retriever=vectors.as_retriever(), return_source_documents=True) 84 | 85 | st.session_state['ready'] = True 86 | 87 | st.divider() 88 | 89 | if st.session_state['ready']: 90 | 91 | if 'generated' not in st.session_state: 92 | st.session_state['generated'] = ["Welcome! 
You can now ask any questions regarding " + uploaded_file.name] 93 | 94 | if 'past' not in st.session_state: 95 | st.session_state['past'] = ["Hey!"] 96 | 97 | # container for chat history 98 | response_container = st.container() 99 | 100 | # container for text box 101 | container = st.container() 102 | 103 | with container: 104 | with st.form(key='my_form', clear_on_submit=True): 105 | user_input = st.text_input("Query:", placeholder="e.g: Summarize the paper in a few sentences", key='input') 106 | submit_button = st.form_submit_button(label='Send') 107 | 108 | if submit_button and user_input: 109 | output = await conversational_chat(user_input) 110 | st.session_state['past'].append(user_input) 111 | st.session_state['generated'].append(output) 112 | 113 | if st.session_state['generated']: 114 | with response_container: 115 | for i in range(len(st.session_state['generated'])): 116 | message(st.session_state["past"][i], is_user=True, key=str(i) + '_user', avatar_style="thumbs") 117 | message(st.session_state["generated"][i], key=str(i), avatar_style="fun-emoji") 118 | 119 | 120 | if __name__ == "__main__": 121 | asyncio.run(main()) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | aiosignal==1.3.1 3 | altair==4.2.2 4 | anyio==3.6.2 5 | appnope==0.1.3 6 | argon2-cffi==21.3.0 7 | argon2-cffi-bindings==21.2.0 8 | arrow==1.2.3 9 | asttokens==2.2.1 10 | async-timeout==4.0.2 11 | attrs==22.2.0 12 | backcall==0.2.0 13 | beautifulsoup4==4.12.1 14 | bleach==6.0.0 15 | blinker==1.6 16 | cachetools==5.3.0 17 | certifi==2022.12.7 18 | cffi==1.15.1 19 | charset-normalizer==3.1.0 20 | click==8.1.3 21 | comm==0.1.3 22 | dataclasses-json==0.5.7 23 | debugpy==1.6.7 24 | decorator==5.1.1 25 | defusedxml==0.7.1 26 | entrypoints==0.4 27 | executing==1.2.0 28 | faiss-cpu==1.7.3 29 | fastjsonschema==2.16.3 30 | fqdn==1.5.1 31 | 
frozenlist==1.3.3 32 | gitdb==4.0.10 33 | GitPython==3.1.31 34 | greenlet==2.0.2 35 | idna==3.4 36 | importlib-metadata==6.1.0 37 | ipykernel==6.22.0 38 | ipython==8.12.0 39 | ipython-genutils==0.2.0 40 | isoduration==20.11.0 41 | jedi==0.18.2 42 | Jinja2==3.1.2 43 | jsonpointer==2.3 44 | jsonschema==4.17.3 45 | jupyter_client==8.1.0 46 | jupyter_core==5.3.0 47 | jupyter-events==0.6.3 48 | jupyter_server==2.5.0 49 | jupyter_server_terminals==0.4.4 50 | jupyterlab-pygments==0.2.2 51 | langchain==0.0.133 52 | markdown-it-py==2.2.0 53 | MarkupSafe==2.1.2 54 | marshmallow==3.19.0 55 | marshmallow-enum==1.5.1 56 | matplotlib-inline==0.1.6 57 | mdurl==0.1.2 58 | mistune==2.0.5 59 | multidict==6.0.4 60 | mypy-extensions==1.0.0 61 | nbclassic==0.5.5 62 | nbclient==0.7.3 63 | nbconvert==7.3.0 64 | nbformat==5.8.0 65 | nest-asyncio==1.5.6 66 | notebook==6.5.4 67 | notebook_shim==0.2.2 68 | numpy==1.24.2 69 | openai==0.27.4 70 | openapi-schema-pydantic==1.2.4 71 | packaging==23.0 72 | pandas==1.5.3 73 | pandocfilters==1.5.0 74 | parso==0.8.3 75 | pexpect==4.8.0 76 | pickleshare==0.7.5 77 | Pillow==9.5.0 78 | pip==23.0.1 79 | platformdirs==3.2.0 80 | prometheus-client==0.16.0 81 | prompt-toolkit==3.0.38 82 | protobuf==3.20.3 83 | psutil==5.9.4 84 | ptyprocess==0.7.0 85 | pure-eval==0.2.2 86 | pyarrow==11.0.0 87 | pycparser==2.21 88 | pydantic==1.10.7 89 | pydeck==0.8.0 90 | Pygments==2.14.0 91 | Pympler==1.0.1 92 | PyPDF2==3.0.1 93 | pyrsistent==0.19.3 94 | python-dateutil==2.8.2 95 | python-dotenv==1.0.0 96 | python-json-logger==2.0.7 97 | pytz==2023.3 98 | pytz-deprecation-shim==0.1.0.post0 99 | PyYAML==6.0 100 | pyzmq==25.0.2 101 | requests==2.28.2 102 | rfc3339-validator==0.1.4 103 | rfc3986-validator==0.1.1 104 | rich==13.3.3 105 | Send2Trash==1.8.0 106 | setuptools==65.6.3 107 | six==1.16.0 108 | smmap==5.0.0 109 | sniffio==1.3.0 110 | soupsieve==2.4 111 | SQLAlchemy==1.4.47 112 | stack-data==0.6.2 113 | streamlit==1.21.0 114 | streamlit-chat==0.0.2.2 115 | 
tenacity==8.2.2 116 | terminado==0.17.1 117 | tinycss2==1.2.1 118 | toml==0.10.2 119 | toolz==0.12.0 120 | tornado==6.2 121 | tqdm==4.65.0 122 | traitlets==5.9.0 123 | typing_extensions==4.5.0 124 | typing-inspect==0.8.0 125 | tzdata==2023.3 126 | tzlocal==4.3 127 | uri-template==1.2.0 128 | urllib3==1.26.15 129 | validators==0.20.0 130 | wcwidth==0.2.6 131 | webcolors==1.13 132 | webencodings==0.5.1 133 | websocket-client==1.5.1 134 | wheel==0.38.4 135 | yarl==1.8.2 136 | zipp==3.15.0 137 | --------------------------------------------------------------------------------