├── .gitignore ├── README.md ├── app.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | gpt4.pdf 2 | .env 3 | PDF_Chat.ipynb 4 | .ipynb_checkpoints 5 | *.pkl 6 | .DS_Store 7 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PDFChat 2 | 3 | The PDFChat app allows you to chat with your PDF files using the power of langchain, OpenAI Embeddings, and GPT3.5 in the backend. 4 | It uses Streamlit for the user interface. 5 | 6 | ## Demo 7 | 8 | https://user-images.githubusercontent.com/19832025/230705607-00e830c1-0181-49b6-ba92-4c9294b6cec3.mp4 9 | 10 | ## Installation 11 | 12 | To install and run the application, follow the instructions below: 13 | 14 | 1. Clone the repository using Git: 15 | 16 | ```bash 17 | git clone https://github.com/dotvignesh/PDFChat.git 18 | ``` 19 | 20 | 2. Change into the repository directory: 21 | 22 | ```bash 23 | cd PDFChat 24 | ``` 25 | 26 | 3. Create a conda environment: 27 | 28 | ```bash 29 | conda create --name pdfchat 30 | ``` 31 | 32 | 4. Activate the new conda environment: 33 | 34 | ```bash 35 | conda activate pdfchat 36 | ``` 37 | 38 | 5. Install the required packages: 39 | 40 | ```bash 41 | pip install -r requirements.txt 42 | ``` 43 | 44 | 6. Get your OpenAI API Key by following these steps: 45 | - Go to [OpenAI Website](https://platform.openai.com/account/api-keys) 46 | - Create an account or log in 47 | - Navigate to the "API Keys" section 48 | - Click on the "Create new secret key" button (or use an existing one) 49 | - Copy the API key 50 | 51 | 7. Create a `.env` file in the root of the repository directory, and add the following line, replacing `<your-api-key>` with your actual API key: 52 | 53 | ```bash 54 | OPENAI_API_KEY=<your-api-key> 55 | ``` 56 | 57 | 8. 
Run the application using streamlit: 58 | 59 | ```bash 60 | streamlit run app.py 61 | ``` 62 | 63 | The application should now be running at http://localhost:8501. 64 | 65 | 66 | ## Usage 67 | 68 | Once the app is running, you can upload your PDF files and start chatting with them using the built-in chat interface. 69 | 70 | Enjoy chatting with your PDF files! 71 | 72 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | from PyPDF2 import PdfReader 2 | from langchain.embeddings.openai import OpenAIEmbeddings 3 | from langchain.text_splitter import RecursiveCharacterTextSplitter 4 | from langchain.vectorstores import FAISS 5 | from langchain.chains.question_answering import load_qa_chain 6 | from langchain.chat_models import ChatOpenAI 7 | from langchain.chains import ConversationalRetrievalChain 8 | import pickle 9 | from pathlib import Path 10 | from dotenv import load_dotenv 11 | import os 12 | import streamlit as st 13 | from streamlit_chat import message 14 | import io 15 | import asyncio 16 | 17 | load_dotenv() 18 | api_key = os.getenv('OPENAI_API_KEY') 19 | 20 | # vectors = getDocEmbeds("gpt4.pdf") 21 | # qa = ChatVectorDBChain.from_llm(ChatOpenAI(model_name="gpt-3.5-turbo"), vectors, return_source_documents=True) 22 | 23 | async def main(): 24 | 25 | async def storeDocEmbeds(file, filename): 26 | 27 | reader = PdfReader(file) 28 | corpus = ''.join([p.extract_text() for p in reader.pages if p.extract_text()]) 29 | 30 | splitter = RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200,) 31 | chunks = splitter.split_text(corpus) 32 | 33 | embeddings = OpenAIEmbeddings(openai_api_key = api_key) 34 | vectors = FAISS.from_texts(chunks, embeddings) 35 | 36 | with open(filename + ".pkl", "wb") as f: 37 | pickle.dump(vectors, f) 38 | 39 | 40 | async def getDocEmbeds(file, filename): 41 | 42 | if not os.path.isfile(filename + ".pkl"): 
43 | await storeDocEmbeds(file, filename) 44 | 45 | with open(filename + ".pkl", "rb") as f: 46 | global vectores 47 | vectors = pickle.load(f) 48 | 49 | return vectors 50 | 51 | 52 | async def conversational_chat(query): 53 | result = qa({"question": query, "chat_history": st.session_state['history']}) 54 | st.session_state['history'].append((query, result["answer"])) 55 | # print("Log: ") 56 | # print(st.session_state['history']) 57 | return result["answer"] 58 | 59 | 60 | llm = ChatOpenAI(model_name="gpt-3.5-turbo") 61 | chain = load_qa_chain(llm, chain_type="stuff") 62 | 63 | if 'history' not in st.session_state: 64 | st.session_state['history'] = [] 65 | 66 | 67 | #Creating the chatbot interface 68 | st.title("PDFChat :") 69 | 70 | if 'ready' not in st.session_state: 71 | st.session_state['ready'] = False 72 | 73 | uploaded_file = st.file_uploader("Choose a file", type="pdf") 74 | 75 | if uploaded_file is not None: 76 | 77 | with st.spinner("Processing..."): 78 | # Add your code here that needs to be executed 79 | uploaded_file.seek(0) 80 | file = uploaded_file.read() 81 | # pdf = PyPDF2.PdfFileReader() 82 | vectors = await getDocEmbeds(io.BytesIO(file), uploaded_file.name) 83 | qa = ConversationalRetrievalChain.from_llm(ChatOpenAI(model_name="gpt-3.5-turbo"), retriever=vectors.as_retriever(), return_source_documents=True) 84 | 85 | st.session_state['ready'] = True 86 | 87 | st.divider() 88 | 89 | if st.session_state['ready']: 90 | 91 | if 'generated' not in st.session_state: 92 | st.session_state['generated'] = ["Welcome! 
You can now ask any questions regarding " + uploaded_file.name] 93 | 94 | if 'past' not in st.session_state: 95 | st.session_state['past'] = ["Hey!"] 96 | 97 | # container for chat history 98 | response_container = st.container() 99 | 100 | # container for text box 101 | container = st.container() 102 | 103 | with container: 104 | with st.form(key='my_form', clear_on_submit=True): 105 | user_input = st.text_input("Query:", placeholder="e.g: Summarize the paper in a few sentences", key='input') 106 | submit_button = st.form_submit_button(label='Send') 107 | 108 | if submit_button and user_input: 109 | output = await conversational_chat(user_input) 110 | st.session_state['past'].append(user_input) 111 | st.session_state['generated'].append(output) 112 | 113 | if st.session_state['generated']: 114 | with response_container: 115 | for i in range(len(st.session_state['generated'])): 116 | message(st.session_state["past"][i], is_user=True, key=str(i) + '_user', avatar_style="thumbs") 117 | message(st.session_state["generated"][i], key=str(i), avatar_style="fun-emoji") 118 | 119 | 120 | if __name__ == "__main__": 121 | asyncio.run(main()) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.8.4 2 | aiosignal==1.3.1 3 | altair==4.2.2 4 | anyio==3.6.2 5 | appnope==0.1.3 6 | argon2-cffi==21.3.0 7 | argon2-cffi-bindings==21.2.0 8 | arrow==1.2.3 9 | asttokens==2.2.1 10 | async-timeout==4.0.2 11 | attrs==22.2.0 12 | backcall==0.2.0 13 | beautifulsoup4==4.12.1 14 | bleach==6.0.0 15 | blinker==1.6 16 | cachetools==5.3.0 17 | certifi==2022.12.7 18 | cffi==1.15.1 19 | charset-normalizer==3.1.0 20 | click==8.1.3 21 | comm==0.1.3 22 | dataclasses-json==0.5.7 23 | debugpy==1.6.7 24 | decorator==5.1.1 25 | defusedxml==0.7.1 26 | entrypoints==0.4 27 | executing==1.2.0 28 | faiss-cpu==1.7.3 29 | fastjsonschema==2.16.3 30 | fqdn==1.5.1 31 | 
frozenlist==1.3.3 32 | gitdb==4.0.10 33 | GitPython==3.1.31 34 | greenlet==2.0.2 35 | idna==3.4 36 | importlib-metadata==6.1.0 37 | ipykernel==6.22.0 38 | ipython==8.12.0 39 | ipython-genutils==0.2.0 40 | isoduration==20.11.0 41 | jedi==0.18.2 42 | Jinja2==3.1.2 43 | jsonpointer==2.3 44 | jsonschema==4.17.3 45 | jupyter_client==8.1.0 46 | jupyter_core==5.3.0 47 | jupyter-events==0.6.3 48 | jupyter_server==2.5.0 49 | jupyter_server_terminals==0.4.4 50 | jupyterlab-pygments==0.2.2 51 | langchain==0.0.133 52 | markdown-it-py==2.2.0 53 | MarkupSafe==2.1.2 54 | marshmallow==3.19.0 55 | marshmallow-enum==1.5.1 56 | matplotlib-inline==0.1.6 57 | mdurl==0.1.2 58 | mistune==2.0.5 59 | multidict==6.0.4 60 | mypy-extensions==1.0.0 61 | nbclassic==0.5.5 62 | nbclient==0.7.3 63 | nbconvert==7.3.0 64 | nbformat==5.8.0 65 | nest-asyncio==1.5.6 66 | notebook==6.5.4 67 | notebook_shim==0.2.2 68 | numpy==1.24.2 69 | openai==0.27.4 70 | openapi-schema-pydantic==1.2.4 71 | packaging==23.0 72 | pandas==1.5.3 73 | pandocfilters==1.5.0 74 | parso==0.8.3 75 | pexpect==4.8.0 76 | pickleshare==0.7.5 77 | Pillow==9.5.0 78 | pip==23.0.1 79 | platformdirs==3.2.0 80 | prometheus-client==0.16.0 81 | prompt-toolkit==3.0.38 82 | protobuf==3.20.3 83 | psutil==5.9.4 84 | ptyprocess==0.7.0 85 | pure-eval==0.2.2 86 | pyarrow==11.0.0 87 | pycparser==2.21 88 | pydantic==1.10.7 89 | pydeck==0.8.0 90 | Pygments==2.14.0 91 | Pympler==1.0.1 92 | PyPDF2==3.0.1 93 | pyrsistent==0.19.3 94 | python-dateutil==2.8.2 95 | python-dotenv==1.0.0 96 | python-json-logger==2.0.7 97 | pytz==2023.3 98 | pytz-deprecation-shim==0.1.0.post0 99 | PyYAML==6.0 100 | pyzmq==25.0.2 101 | requests==2.28.2 102 | rfc3339-validator==0.1.4 103 | rfc3986-validator==0.1.1 104 | rich==13.3.3 105 | Send2Trash==1.8.0 106 | setuptools==65.6.3 107 | six==1.16.0 108 | smmap==5.0.0 109 | sniffio==1.3.0 110 | soupsieve==2.4 111 | SQLAlchemy==1.4.47 112 | stack-data==0.6.2 113 | streamlit==1.21.0 114 | streamlit-chat==0.0.2.2 115 | 
tenacity==8.2.2 116 | terminado==0.17.1 117 | tinycss2==1.2.1 118 | toml==0.10.2 119 | toolz==0.12.0 120 | tornado==6.2 121 | tqdm==4.65.0 122 | traitlets==5.9.0 123 | typing_extensions==4.5.0 124 | typing-inspect==0.8.0 125 | tzdata==2023.3 126 | tzlocal==4.3 127 | uri-template==1.2.0 128 | urllib3==1.26.15 129 | validators==0.20.0 130 | wcwidth==0.2.6 131 | webcolors==1.13 132 | webencodings==0.5.1 133 | websocket-client==1.5.1 134 | wheel==0.38.4 135 | yarl==1.8.2 136 | zipp==3.15.0 137 | --------------------------------------------------------------------------------