├── .devcontainer
│   └── devcontainer.json
├── .gitattributes
├── .github
│   └── ISSUE_TEMPLATE
│       ├── bug_report.md
│       └── feature_request.md
├── .gitignore
├── .grit
│   └── .gitignore
├── LICENSE
├── README.md
├── api.py
├── chat_history.json
├── chatbot.py
├── docs
│   └── white_paper.pdf
├── example.env
├── huxley.py
├── huxleychat-home.png
├── huxleychat-how-it-works.png
├── huxleychat-sidebar-apikey.png
├── huxleychat_banner.png
├── requirements.txt
├── templates
│   ├── condense_prompt.py
│   └── qa_prompt.py
└── utils
    ├── ingest.py
    └── query.py

--------------------------------------------------------------------------------
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "Python 3",
 3 |   // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
 4 |   "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye",
 5 |   "customizations": {
 6 |     "codespaces": {
 7 |       "openFiles": [
 8 |         "README.md",
 9 |         "huxley.py"
10 |       ]
11 |     },
12 |     "vscode": {
13 |       "settings": {},
14 |       "extensions": [
15 |         "ms-python.python",
16 |         "ms-python.vscode-pylance"
17 |       ]
18 |     }
19 |   },
20 |   "updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y <packages.txt; [ -f requirements.txt ] && pip3 install --user -r requirements.txt"

--------------------------------------------------------------------------------
/huxley.py:
--------------------------------------------------------------------------------
 99 | ), and [OpenAI](https://openai.com) and made by "
100 |             "[@fredsiika](https://github.com/fredsiika)."
101 |             "\n\n"
102 |             "View Source Code on [Github](https://github.com/fredsiika/huxley-pdf)"
103 |         ))
104 |     with col2:
105 |         st.image(image='huxleychat_banner.png', width=300, caption='Tutorial and accompanying documentation coming soon.')
106 |     # End Top Information
107 |     return
108 | 
109 | # Function to set up the environment
110 | def setup_environment():
111 |     print('Setting up environment')
112 |     # connect_to_pinecone(index)
113 | 
114 | def connect_to_pinecone(index_name):
115 |     """Connect to Pinecone and return the index stats."""
116 | 
117 |     # find API key in console at app.pinecone.io
118 |     PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
119 |     # find ENV (cloud region) next to API key in console
120 |     PINECONE_ENVIRONMENT = os.getenv('PINECONE_ENVIRONMENT')
121 | 
122 |     openai_model = 'gpt-3.5-turbo'
123 |     temperature = 0.5
124 | 
125 |     # initialize pinecone
126 |     pinecone.init(
127 |         api_key=PINECONE_API_KEY,  # find at app.pinecone.io
128 |         environment=PINECONE_ENVIRONMENT  # next to api key in console
129 |     )
130 | 
131 |     model = ChatOpenAI(
132 |         model_name=openai_model,
133 |         temperature=temperature,
134 |         openai_api_key=os.getenv("OPENAI_API_KEY"),
135 |         streaming=False
136 |     )  # temperature ranges from 0 (deterministic) to 2 (most random)
137 | 
138 |     # only create the index if it doesn't exist
139 |     if index_name not in pinecone.list_indexes():
140 |         pinecone.create_index(
141 |             name=index_name,
142 |             dimension=1536,  # dimension of text-embedding-ada-002 vectors; ChatOpenAI has no embedding-dimension method
143 |             metric='cosine'
144 |         )
145 | 
146 |     # now connect to the index
147 |     print(f"Connecting to Pinecone..\nindex_name: {index_name}")
148 |     index = pinecone.GRPCIndex(index_name)
149 | 
150 |     # wait a moment for the index to be fully initialized
151 |     time.sleep(1)
152 | 
153 |     loader = PyMuPDFLoader("./docs/white_paper.pdf")
154 |     documents = loader.load()
155 |     text_splitter = CharacterTextSplitter(chunk_size=400, chunk_overlap=20)
156 |     docs = text_splitter.split_documents(documents)
157 |     embeddings = OpenAIEmbeddings()
158 | 
159 |     # embed the chunks and upsert them into the index
160 |     docsearch = Pinecone.from_documents(docs, embeddings, index_name=index_name)
161 |     query = "Why did the chicken cross the road?"
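    |     # similarity_search() embeds the query string and returns the k most
    |     # similar chunks from the index (k defaults to 4 in LangChain's wrapper).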
162 |     docs = docsearch.similarity_search(query)
163 |     print(f'\n{docs[0].page_content}\n')
164 | 
165 |     # print(f"\nClients connected to Pinecone index {index_name} \n{index.describe_index_stats()}\n")
166 |     return index.describe_index_stats()
167 | 
168 | def clear_submit():
169 |     st.session_state["submit"] = False
170 | 
171 | def sidebar():
172 |     with st.sidebar:
173 |         st.markdown('''## About HuxleyPDF''')
174 |         st.markdown('''
175 | HuxleyPDF is a Python application that allows you to upload a PDF and ask questions about it using natural language.
176 | 
177 | ## How it works:
178 | 
179 | Upload personal docs and chat with your PDF files in this GPT-4-powered app.
180 | Built with [LangChain](https://docs.langchain.com/docs/) and [Pinecone Vector DB](https://pinecone.io/), deployed on [Streamlit](https://streamlit.io).
181 | 
182 | ## How to use:
183 | 
184 | 1. Upload a PDF
185 | 2. Ask a question about the PDF
186 | 3. Get an answer about the PDF
187 | 4. Repeat
188 | 
189 | ## Before you start using HuxleyPDF:
190 | 
191 | - You need an OpenAI API key. You can get one [here](https://platform.openai.com/account/api-keys).
192 | - You need a Pinecone API key. You can get one [here](https://www.pinecone.io/).
193 | - You need a Pinecone environment. You can create one [here](https://www.pinecone.io/).
194 | 
195 | ## How to obtain your OpenAI API key:
196 | 
197 | 1. Sign in to your OpenAI account. If you do not have an account, [click here](https://platform.openai.com/signup) to sign up.
198 | 
199 | 2. Visit the [OpenAI API keys page](https://platform.openai.com/account/api-keys).
200 | 
201 | 
202 | ![Step 1 and 2: Create an API Key Screenshot](https://www.usechatgpt.ai/assets/chrome-extension/open-key-create.png)
203 | 
204 | 3. Create a new secret key and copy & paste it into the "API key" input field below. 👇🏾
205 | ''')
206 | 
207 |         st.markdown('''
208 | ## OpenAI API key
209 | 
210 | **Tips:**
211 | 
212 | - The official OpenAI API is more stable than the ChatGPT free plan. However, charges based on usage do apply.
213 | - Your API key is saved locally in your browser and not transmitted anywhere else.
214 | - If you provide an API key enabled for GPT-4, the app will support GPT-4.
215 | - Your free OpenAI API credits can expire, so please check [the expiration status of your API key here](https://platform.openai.com/account/usage).
216 | - Access may be unstable when demand is high, especially with a free OpenAI API key.
217 | 
218 | ''')
219 |         add_vertical_space(5)
220 |         st.write('[HuxleyPDF](https://github.com/fredsiika/huxley-pdf) was made with ❤️ by [Fred](https://github.com/fredsiika)')
221 | 
222 |         st.write(
223 |             "openai_api_key set: ",
224 |             check_openai_api_key()
225 |             # f'{True}' if os.environ.get('OPENAI_API_KEY') else f'{False}'
226 |         )
227 |         st.write(
228 |             "pinecone_api_key set: ",
229 |             check_pinecone_api_key()
230 |             # True if os.environ.get('PINECONE_API_KEY') == st.secrets['PINECONE_API_KEY'] else False
231 |         )
232 |         st.write(
233 |             "pinecone_index set: ",
234 |             check_pinecone_index()
235 |             # os.environ.get('PINECONE_INDEX') == st.secrets['PINECONE_INDEX'],
236 |         )
237 |         st.write(
238 |             'pinecone_namespace set: ',
239 |             check_pinecone_namespace()
240 |             # os.environ.get('PINECONE_NAMESPACE') == st.secrets['PINECONE_NAMESPACE'],
241 |         )
242 |         # st.write(
243 |         #     "pinecone_environment set: ",
244 | 
245 |         #     # os.environ.get('PINECONE_ENVIRONMENT') == st.secrets['PINECONE_ENVIRONMENT'],
246 |         # )
247 | 
248 | def upload_files():
249 |     uploaded_files = st.file_uploader(
250 |         "Upload multiple files",
251 |         type="pdf",
252 |         help="docx and txt files are still in beta.",
253 |         accept_multiple_files=True,
254 |         on_change=clear_submit
255 |     )
256 | 
257 |     if uploaded_files is None:
258 |         st.info("Please upload a file of type: " + ", ".join(["pdf"]))
259 |     return uploaded_files
260 | 
261 | # To get the tokenizer corresponding to a specific model in the OpenAI API:
262 | tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo')  # the specific tiktoken encoder used by gpt-3.5-turbo: https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L74
263 | 
264 | def tiktoken_len(text):
265 |     """Returns the length of the text in tokens."""
266 |     tokens = tokenizer.encode(
267 |         text,
268 |         disallowed_special=()
269 |     )
270 |     return len(tokens)
271 | 
272 | # Function to ingest the files
273 | def ingest_files(uploaded_files):
274 |     # find API key in console at app.pinecone.io
275 |     PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
276 |     # find ENV (cloud region) next to API key in console
277 |     PINECONE_ENVIRONMENT = os.getenv('PINECONE_ENVIRONMENT')
278 | 
279 |     try:
280 |         with st.spinner("Indexing documents... this might take a while ⏳"):
281 |             # Ingest the uploaded files:
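    |             # The block below is the full ingestion pipeline: write the uploads
    |             # into a temporary directory, load them with DirectoryLoader +
    |             # PyMuPDFLoader, split them into ~2000-token chunks (measured by
    |             # tiktoken_len), embed them with text-embedding-ada-002, and
    |             # upsert the vectors into the Pinecone index.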
282 |             with tempfile.TemporaryDirectory() as tmpdir:
283 |                 for uploaded_file in uploaded_files:
284 |                     file_name = uploaded_file.name
285 |                     file_content = uploaded_file.read()
286 |                     st.write("Filename: ", file_name)
287 |                     with open(os.path.join(tmpdir, file_name), "wb") as file:
288 |                         file.write(file_content)
289 |                 loader = DirectoryLoader(tmpdir, glob="**/*.pdf", loader_cls=PyMuPDFLoader)  # type: ignore
290 |                 documents = loader.load()
291 |                 text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100, length_function=tiktoken_len)
292 |                 documents = text_splitter.split_documents(documents)
293 |                 pinecone.init(
294 |                     api_key=PINECONE_API_KEY,  # find at app.pinecone.io
295 |                     environment=PINECONE_ENVIRONMENT  # next to api key in console
296 |                 )
297 |                 openai_api_key = os.getenv('OPENAI_API_KEY')
298 |                 embeddings = OpenAIEmbeddings(model='text-embedding-ada-002', openai_api_key=openai_api_key, client=None)
299 |                 # from_existing_index only connects to an index; from_documents actually upserts the new chunks
300 |                 Pinecone.from_documents(documents, embeddings, index_name='huxleypdf', namespace='ns1')
301 |             st.success("Ingested File!")
302 |             st.session_state["api_key_configured"] = True
303 |     except Exception as e:
304 |         st.error(f"Error while ingesting the files: {str(e)}")
305 |         return None
306 | 
307 | # Function to display PDF as image on mobile devices
308 | def show_pdf_as_image(pdf_bytes):
309 |     images = convert_from_bytes(pdf_bytes)
310 |     for image in images:
311 |         st.image(image)
312 | 
313 | # Function to display PDF as iFrame on desktop
314 | def show_pdf_as_iframe(file):
315 |     if file is not None:
316 |         pdf_bytes = file.read()
317 |         base64_pdf = base64.b64encode(pdf_bytes).decode('utf-8')
318 |         pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="700" height="1000" type="application/pdf"></iframe>'  # reconstructed inline viewer; dimensions are arbitrary
319 |         st.markdown(pdf_display, unsafe_allow_html=True)
320 | 
321 |         pdf_reader = PdfReader(file)
322 | 
323 | def main():
324 |     render_header()
325 |     sidebar()
326 |     # setup_environment()
327 | 
328 | 
329 |     # Upload file
330 |     pdf = st.file_uploader("Upload your PDF", type="pdf")
331 | 
332 |     # Fetching remote PDFs using Unstructured
333 |     # loader = OnlinePDFLoader("https://arxiv.org/pdf/2302.03803.pdf")
334 |     # data = loader.load()
335 |     # print(data)
336 | 
337 |     # extract the text
338 |     if pdf is not None:
339 |         pdf_reader = PdfReader(pdf)
340 |         text = ""
341 |         for page in pdf_reader.pages:
342 |             text += page.extract_text()
343 | 
344 |         # Split into chunks
345 |         text_splitter = CharacterTextSplitter(
346 |             separator="\n",
347 |             chunk_size=400,
348 |             chunk_overlap=80,  # I usually set chunk_overlap to ~20% of chunk_size
349 |             length_function=len
350 |         )
351 |         chunks = text_splitter.split_text(text)
352 | 
353 |         # create embeddings
354 |         embeddings = OpenAIEmbeddings()
355 | 
356 |         # TODO: render image of pdf
357 |         # show_pdf_as_iframe(pdf)
358 | 
359 |         knowledge_base = Pinecone.from_existing_index(index_name='huxleypdf', embedding=embeddings, namespace='ns1')
360 | 
361 |         # show user input
362 |         user_question = st.text_input("Ask a question about your PDF: ")
363 |         if user_question:
364 |             docs = knowledge_base.similarity_search(user_question)
365 |             llm = OpenAI()
366 |             chain = load_qa_chain(llm, chain_type="stuff")
367 |             with get_openai_callback() as cb:
368 |                 response = chain.run(input_documents=docs, question=user_question)
369 |                 print(cb)
370 | 
371 |             st.write(response)
372 | 
373 |     # TODO: Add error handling
374 | 
375 | if __name__ == '__main__':
376 |     main()
377 | 
--------------------------------------------------------------------------------
/huxleychat-home.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fredsiika/huxley-pdf/02c82e78f189e55453393f84f2c5bff6967ee77e/huxleychat-home.png
--------------------------------------------------------------------------------
/huxleychat-how-it-works.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fredsiika/huxley-pdf/02c82e78f189e55453393f84f2c5bff6967ee77e/huxleychat-how-it-works.png
--------------------------------------------------------------------------------
/huxleychat-sidebar-apikey.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fredsiika/huxley-pdf/02c82e78f189e55453393f84f2c5bff6967ee77e/huxleychat-sidebar-apikey.png
--------------------------------------------------------------------------------
/huxleychat_banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fredsiika/huxley-pdf/02c82e78f189e55453393f84f2c5bff6967ee77e/huxleychat_banner.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
  1 | pydeck
  2 | lz4
  3 | certifi
  4 | gunicorn
  5 | anyio
  6 | pyrsistent
  7 | rich
  8 | pymongo
  9 | tiktoken
 10 | asgiref
 11 | websockets
 12 | packaging
 13 | toml
 14 | backoff
 15 | colorama
 16 | djangorestframework
 17 | zipp
 18 | jmespath
 19 | tenacity
 20 | protobuf
 21 | psycopg2-binary
 22 | Pympler
 23 | greenlet
 24 | pandas
 25 | clickhouse-connect
 26 | fastapi
 27 | six
 28 | zstandard
 29 | numpy
 30 | uvicorn
 31 | openapi-schema-pydantic
 32 | markdown-it-py
 33 | boto3
 34 | mpmath
 35 | Pillow
 36 | dnspython
 37 | watchdog
 38 | PyYAML
 39 | smmap
 40 | Pygments
 41 | s3transfer
 42 | botocore
 43 | pytz
 44 | regex
 45 | django-dotenv
 46 | posthog
 47 | scikit-learn
 48 | h11
 49 | requests
 50 | typing_extensions
 51 | jsonschema
 52 | python-dotenv
 53 | typing-inspect
 54 | httptools
 55 | aiosignal
 56 | dataclasses-json
 57 | sentry-sdk
 58 | tzdata
 59 | importlib-metadata
 60 | pyarrow
 61 | validators
 62 | idna
 63 | pinecone-client
 64 | djangorestframework-simplejwt
 65 | sentence-transformers
 66 | torch
 67 | mdurl
 68 | scipy
 69 | tzlocal
 70 | gitdb
 71 | hnswlib
 72 | urllib3
 73 | altair
 74 | frozenlist
 75 | threadpoolctl
 76 | yarl
 77 | multidict
 78 | pytz-deprecation-shim
 79 | starlette
 80 | fsspec
 81 | Jinja2
 82 | async-timeout
 83 | marshmallow
 84 | aiohttp
 85 | attrs
 86 | django-ninja
 87 | nltk
 88 | PyMuPDF
 89 | sqlparse
 90 | joblib
 91 | streamlit-extras
 92 | tornado
 93 | win32-setctime
 94 | pydantic
 95 | sentencepiece
 96 | charset-normalizer
 97 | cachetools
 98 | click
 99 | MarkupSafe
100 | PyJWT
101 | filelock
102 | entrypoints
103 | monotonic
104 | sympy
105 | python-dateutil
106 | tokenizers
107 | sniffio
108 | watchfiles
109 | openai
110 | django-cors-headers
111 | SQLAlchemy
112 | duckdb
113 | networkx
114 | mypy-extensions
115 | toolz
116 | streamlit
117 | Django
118 | blinker
119 | decorator
120 | GitPython
121 | tqdm
122 | torchvision
123 | PyPDF2
124 | langchain
125 | loguru
126 | transformers
127 | marshmallow-enum
128 | faiss-cpu
129 | pdf2image
--------------------------------------------------------------------------------
/templates/condense_prompt.py:
--------------------------------------------------------------------------------
1 | CONDENSE_PROMPT = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.
2 | 
3 | Chat History:
4 | {chat_history}
5 | Follow Up Input: {question}
6 | Standalone question:"""
--------------------------------------------------------------------------------
/templates/qa_prompt.py:
--------------------------------------------------------------------------------
1 | QA_PROMPT = """You are a helpful AI assistant named HuxleyPDF. Use the following pieces of context to answer the question at the end.
2 | If you don't know the answer, just say you don't know. DO NOT try to make up an answer.
3 | If the question is not related to the context, politely respond that you are tuned to only answer questions that are related to the context.
4 | Use as much detail as possible when responding.
5 | 
6 | {context}
7 | 
8 | Question: {question}
9 | Helpful answer in markdown format:"""
--------------------------------------------------------------------------------
/utils/ingest.py:
--------------------------------------------------------------------------------
 1 | from langchain.embeddings.openai import OpenAIEmbeddings
 2 | from langchain.document_loaders import DirectoryLoader, PyMuPDFLoader
 3 | from langchain.vectorstores import Pinecone
 4 | import pinecone
 5 | from langchain.text_splitter import RecursiveCharacterTextSplitter
 6 | from langchain.vectorstores import Chroma
 7 | 
 8 | 
 9 | def ingest(openai_api_key, pinecone_api_key, pinecone_environment, pinecone_index, pinecone_namespace, use_pinecone):
10 |     loader = DirectoryLoader('docs', glob="**/*.pdf", loader_cls=PyMuPDFLoader)
11 |     documents = loader.load()
12 | 
13 |     text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
14 |     documents = text_splitter.split_documents(documents)
15 |     embeddings = OpenAIEmbeddings(
16 |         model='text-embedding-ada-002',
17 |         openai_api_key=openai_api_key
18 |     )
19 | 
20 |     if use_pinecone:
21 |         pinecone.init(
22 |             api_key=pinecone_api_key,  # find at app.pinecone.io
23 |             environment=pinecone_environment  # next to api key in console
24 |         )
25 | 
26 |         Pinecone.from_documents(
27 |             documents, embeddings,
28 |             index_name=pinecone_index,
29 |             namespace=pinecone_namespace
30 |         )
31 | 
32 |         return 'Finished Ingesting, stored at Pinecone'
33 | 
34 |     else:
35 |         vectorstore = Chroma.from_documents(
36 |             documents,
37 |             embeddings,
38 |             collection_name="my_collection",
39 |             persist_directory="./vectorstore"
40 |         )
41 |         vectorstore.persist()  # flush the collection to disk so query.py can reload it
42 | 
43 |         return 'Finished Ingesting, stored at ./vectorstore'
--------------------------------------------------------------------------------
/utils/query.py:
--------------------------------------------------------------------------------
 1 | from langchain.embeddings.openai import OpenAIEmbeddings
 2 | from langchain.chains import ConversationalRetrievalChain
 3 | from langchain.chat_models import ChatOpenAI
 4 | from langchain.prompts import PromptTemplate
 5 | from langchain.vectorstores import Pinecone
 6 | import pinecone
 7 | from templates.qa_prompt import QA_PROMPT
 8 | from templates.condense_prompt import CONDENSE_PROMPT
 9 | from langchain.vectorstores import Chroma
10 | 
11 | 
12 | def query(openai_api_key, pinecone_api_key, pinecone_environment, pinecone_index, pinecone_namespace, temperature, sources, use_pinecone):
13 |     embeddings = OpenAIEmbeddings(
14 |         model='text-embedding-ada-002', openai_api_key=openai_api_key)
15 | 
16 |     if use_pinecone:
17 |         pinecone.init(api_key=pinecone_api_key,
18 |                       environment=pinecone_environment)
19 |         vectorstore = Pinecone.from_existing_index(
20 |             index_name=pinecone_index, embedding=embeddings, text_key='text', namespace=pinecone_namespace)
21 |     else:
22 |         # Load the persisted database from disk
23 |         persist_directory = "./vectorstore"
24 |         vectorstore = Chroma(
25 |             persist_directory=persist_directory, embedding_function=embeddings, collection_name="my_collection")
26 | 
27 |     model = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=temperature,
28 |                        openai_api_key=openai_api_key, streaming=True)  # temperature ranges from 0 (deterministic) to 2 (most random)
29 |     retriever = vectorstore.as_retriever(search_kwargs={"k": sources})  # 9 is the max sources
30 | 
31 |     # as_retriever() does not accept prompt arguments; the custom prompts belong
32 |     # to the chain itself: CONDENSE_PROMPT rewrites the follow-up question into a
33 |     # standalone one, QA_PROMPT formats the final answer over the retrieved context.
34 |     qa = ConversationalRetrievalChain.from_llm(
35 |         llm=model,
36 |         retriever=retriever,
37 |         condense_question_prompt=PromptTemplate.from_template(CONDENSE_PROMPT),
38 |         combine_docs_chain_kwargs={'prompt': PromptTemplate.from_template(QA_PROMPT)},
39 |         return_source_documents=True
40 |     )
41 |     return qa
--------------------------------------------------------------------------------
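Usage sketch (not a file in this repository): the repo does not ship a combined entry point for utils/ingest.py and utils/query.py, so the snippet below is a minimal, hypothetical example of how the two functions might be wired together. The environment variable names mirror those read elsewhere in the repo; the temperature, source count, and question text are illustrative assumptions.

import os

from utils.ingest import ingest
from utils.query import query

# Index every PDF under ./docs (stored in Pinecone when use_pinecone=True,
# otherwise persisted to a local Chroma store under ./vectorstore).
print(ingest(
    openai_api_key=os.environ['OPENAI_API_KEY'],
    pinecone_api_key=os.environ['PINECONE_API_KEY'],
    pinecone_environment=os.environ['PINECONE_ENVIRONMENT'],
    pinecone_index=os.environ['PINECONE_INDEX'],
    pinecone_namespace=os.environ['PINECONE_NAMESPACE'],
    use_pinecone=True,
))

# Build the conversational chain and ask one question with an empty history.
qa = query(
    openai_api_key=os.environ['OPENAI_API_KEY'],
    pinecone_api_key=os.environ['PINECONE_API_KEY'],
    pinecone_environment=os.environ['PINECONE_ENVIRONMENT'],
    pinecone_index=os.environ['PINECONE_INDEX'],
    pinecone_namespace=os.environ['PINECONE_NAMESPACE'],
    temperature=0.5,
    sources=4,  # number of chunks the retriever passes to the model
    use_pinecone=True,
)
result = qa({'question': 'What is the white paper about?', 'chat_history': []})
print(result['answer'])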