├── .devcontainer
│   └── devcontainer.json
├── .gitattributes
├── .github
│   └── ISSUE_TEMPLATE
│       ├── bug_report.md
│       └── feature_request.md
├── .gitignore
├── .grit
│   └── .gitignore
├── LICENSE
├── README.md
├── api.py
├── chat_history.json
├── chatbot.py
├── docs
│   └── white_paper.pdf
├── example.env
├── huxley.py
├── huxleychat-home.png
├── huxleychat-how-it-works.png
├── huxleychat-sidebar-apikey.png
├── huxleychat_banner.png
├── requirements.txt
├── templates
│   ├── condense_prompt.py
│   └── qa_prompt.py
└── utils
    ├── ingest.py
    └── query.py
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
1 | {
2 |     "name": "Python 3",
3 |     // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
4 |     "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye",
5 |     "customizations": {
6 |         "codespaces": {
7 |             "openFiles": [
8 |                 "README.md",
9 |                 "huxley.py"
10 |             ]
11 |         },
12 |         "vscode": {
13 |             "settings": {},
14 |             "extensions": [
15 |                 "ms-python.python",
16 |                 "ms-python.vscode-pylance"
17 |             ]
18 |         }
19 |     },
20 |     "updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y
--------------------------------------------------------------------------------
/huxley.py:
--------------------------------------------------------------------------------
99 |             "), and [OpenAI]() and made by "
100 | "[@fredsiika]()."
101 | "\n\n"
102 | "View Source Code on [Github]()"
103 | ))
104 | with col2:
105 | st.image(image='huxleychat_banner.png', width=300, caption='Tutorial and accompanying documentation coming soon.')
106 | # End Top Information
107 | return
108 |
109 | # Function to set up the environment
110 | def setup_environment():
111 |     print('Setting up environment')
112 |     # connect_to_pinecone(index)
113 |
114 | def connect_to_pinecone(index_name):
115 |     """Connect to Pinecone and return the index stats."""
116 | 
117 |     # find API key in console at app.pinecone.io
118 |     PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
119 |     # find ENV (cloud region) next to API key in console
120 |     PINECONE_ENVIRONMENT = os.getenv('PINECONE_ENVIRONMENT')
121 | 
122 |     openai_model = 'gpt-3.5-turbo'
123 |     temperature = 0.5
124 | 
125 |     # initialize pinecone
126 |     pinecone.init(
127 |         api_key=PINECONE_API_KEY,  # find at app.pinecone.io
128 |         environment=PINECONE_ENVIRONMENT  # next to api key in console
129 |     )
130 | 
131 |     model = ChatOpenAI(
132 |         model_name=openai_model,
133 |         temperature=temperature,
134 |         openai_api_key=os.getenv("OPENAI_API_KEY"),
135 |         streaming=False
136 |     )  # temperature ranges from 0 to 2
137 | 
138 |     # only create the index if it doesn't exist
139 |     if index_name not in pinecone.list_indexes():
140 |         pinecone.create_index(
141 |             name=index_name,
142 |             dimension=1536,  # output dimension of text-embedding-ada-002 (a ChatOpenAI model has no embedding dimension)
143 |             metric='cosine'
144 |         )
145 | 
146 |     # now connect to the index
147 |     print(f"Connecting to Pinecone...\nindex_name: {index_name}")
148 |     index = pinecone.GRPCIndex(index_name)
149 | 
150 |     # wait a moment for the index to be fully initialized
151 |     time.sleep(1)
152 | 
153 |     loader = PyMuPDFLoader("./docs/white_paper.pdf")
154 |     documents = loader.load()
155 |     text_splitter = CharacterTextSplitter(chunk_size=400, chunk_overlap=20)
156 |     docs = text_splitter.split_documents(documents)
157 |     embeddings = OpenAIEmbeddings()
158 | 
159 |     # embed the chunks and upsert them into the index
160 |     docsearch = Pinecone.from_documents(docs, embeddings, index_name=index_name)
161 |     query = "Why did the chicken cross the road?"  # throwaway smoke-test query
162 |     docs = docsearch.similarity_search(query)
163 |     print(f'\n{docs[0].page_content}\n')
164 | 
165 |     # print(f"\nClients connected to Pinecone index {index_name} \n{index.describe_index_stats()}\n")
166 |     return index.describe_index_stats()
167 |
168 | def clear_submit():
169 |     st.session_state["submit"] = False
170 |
171 | def sidebar():
172 |     with st.sidebar:
173 |         st.markdown('''## About HuxleyPDF''')
174 |         st.markdown('''
175 | HuxleyPDF is a Python application that lets you upload a PDF and ask questions about it using natural language.
176 | 
177 | ## How it works:
178 | 
179 | Upload your own documents and chat with your PDF files in this GPT-4-powered app.
180 | Built with [LangChain](https://docs.langchain.com/docs/), [Pinecone Vector Db](https://pinecone.io/), deployed on [Streamlit](https://streamlit.io)
181 | 
182 | ## How to use:
183 | 
184 | 1. Upload a PDF
185 | 2. Ask a question about the PDF
186 | 3. Get an answer about the PDF
187 | 4. Repeat
188 | 
189 | ## Before you start using HuxleyPDF:
190 | 
191 | - You need an OpenAI API key. You can get one [here](https://platform.openai.com/).
192 | - You need a Pinecone API key. You can get one [here](https://www.pinecone.io/).
193 | - You need a Pinecone environment. You can create one [here](https://www.pinecone.io/).
194 | 
195 | ## How to obtain your OpenAI API key:
196 | 
197 | 1. Sign in to your OpenAI account. If you do not have an account, [click here](https://platform.openai.com/signup) to sign up.
198 | 
199 | 2. Visit the [OpenAI API keys page.](https://platform.openai.com/account/api-keys)
200 | 
201 | 
202 | 
203 | 
204 | 3. Create a new secret key and copy & paste it into the "API key" input field below. 👇🏾
205 | ''')
206 | 
207 |         st.markdown('''
208 | ## OpenAI API key
209 | 
210 | **Tips:**
211 | 
212 | - The official OpenAI API is more stable than the ChatGPT free plan; however, charges based on usage do apply.
213 | - Your API key is saved locally in your browser and is not transmitted anywhere else.
214 | - If you provide an API key enabled for GPT-4, the app will support GPT-4.
215 | - Your free OpenAI API credits can expire, so please check [the status of your API key here.](https://platform.openai.com/account/usage)
216 | - Access to the API may be unstable when demand for the free tier is high.
217 | 
218 | ''')
219 |         add_vertical_space(5)
220 |         st.write('[HuxleyPDF](https://github.com/fredsiika/huxley-pdf) was made with ❤️ by [Fred](https://github.com/fredsiika)')
221 | 
222 |         st.write(
223 |             "openai_api_key set: ",
224 |             check_openai_api_key()
225 |             # f'{True}' if os.environ.get('OPENAI_API_KEY') else f'{False}'
226 |         )
227 |         st.write(
228 |             "pinecone_api set: ",
229 |             check_pinecone_api_key()
230 |             # True if os.environ.get('PINECONE_API_KEY') == st.secrets['PINECONE_API_KEY'] else False
231 |         )
232 |         st.write(
233 |             "pinecone_index set: ",
234 |             check_pinecone_index()
235 |             # os.environ.get('PINECONE_INDEX') == st.secrets['PINECONE_INDEX'],
236 |         )
237 |         st.write(
238 |             'pinecone_namespace set: ',
239 |             check_pinecone_namespace()
240 |             # os.environ.get('PINECONE_NAMESPACE') == st.secrets['PINECONE_NAMESPACE'],
241 |         )
242 |         # st.write(
243 |         #     "pinecone_environment set: ",
244 | 
245 |         #     # os.environ.get('PINECONE_ENVIRONMENT') == st.secrets['PINECONE_ENVIRONMENT'],
246 |         # )
247 |
248 | def upload_files():
249 |     uploaded_files = st.file_uploader(
250 |         "Upload multiple files",
251 |         type="pdf",
252 |         help="doc and txt files are still in beta.",
253 |         accept_multiple_files=True,
254 |         on_change=clear_submit
255 |     )
256 | 
257 |     if not uploaded_files:  # file_uploader returns a (possibly empty) list when accept_multiple_files=True
258 |         st.info("Please upload a file of type: " + ", ".join(["pdf"]))
259 |     return uploaded_files
260 |
261 | # To get the tokenizer corresponding to a specific model in the OpenAI API:
262 | tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo') # specific tiktoken encoder which is used by gpt-3.5-turbo: https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L74
263 |
264 | def tiktoken_len(text):
265 |     """Returns the length of the text in tokens."""
266 |     tokens = tokenizer.encode(
267 |         text,
268 |         disallowed_special=()
269 |     )
270 |     return len(tokens)
271 |
272 | # Function to ingest the files
273 | def ingest_files(uploaded_files):
274 |     # find API key in console at app.pinecone.io
275 |     PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
276 |     # find ENV (cloud region) next to API key in console
277 |     PINECONE_ENVIRONMENT = os.getenv('PINECONE_ENVIRONMENT')
278 | 
279 |     try:
280 |         with st.spinner("Indexing documents... this might take a while ⏳"):
281 |             # Write the uploads to a temp dir so DirectoryLoader can read them
282 |             with tempfile.TemporaryDirectory() as tmpdir:
283 |                 for uploaded_file in uploaded_files:
284 |                     file_name = uploaded_file.name
285 |                     file_content = uploaded_file.read()
286 |                     st.write("Filename: ", file_name)
287 |                     with open(os.path.join(tmpdir, file_name), "wb") as file:
288 |                         file.write(file_content)
289 |                 loader = DirectoryLoader(tmpdir, glob="**/*.pdf", loader_cls=PyMuPDFLoader)  # type: ignore
290 |                 documents = loader.load()
291 |                 text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100, length_function=tiktoken_len)
292 |                 documents = text_splitter.split_documents(documents)
293 |                 pinecone.init(
294 |                     api_key=PINECONE_API_KEY,  # find at app.pinecone.io
295 |                     environment=PINECONE_ENVIRONMENT  # next to api key in console
296 |                 )
297 |                 openai_api_key = os.getenv('OPENAI_API_KEY')
298 |                 embeddings = OpenAIEmbeddings(model='text-embedding-ada-002', openai_api_key=openai_api_key, client=None)
299 |                 Pinecone.from_documents(documents, embeddings, index_name='huxleypdf', namespace='ns1')
300 |                 # Pinecone.from_existing_index only connects to the index; it does not upsert the new documents
301 |         st.success("Ingested File!")
302 |         st.session_state["api_key_configured"] = True
303 |     except Exception as e:
304 |         st.error(f"Error while ingesting the files: {str(e)}")
305 |         return None
306 |
307 | # Function to display PDF as image on mobile devices
308 | def show_pdf_as_image(pdf_bytes):
309 |     images = convert_from_bytes(pdf_bytes)
310 |     for image in images:
311 |         st.image(image)
312 | 
313 | # Function to display PDF as iFrame on desktop
314 | def show_pdf_as_iframe(file):
315 |     if file is not None:
316 |         pdf_bytes = file.read()
317 |         base64_pdf = base64.b64encode(pdf_bytes).decode('utf-8')
318 |         pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="700" height="1000" type="application/pdf"></iframe>'  # iframe markup restored; dimensions are arbitrary
319 |         st.markdown(pdf_display, unsafe_allow_html=True)
320 | 
321 |         pdf_reader = PdfReader(file)
322 |
323 | def main():
324 |     render_header()
325 |     sidebar()
326 |     # setup_environment()
327 | 
328 | 
329 |     # Upload file
330 |     pdf = st.file_uploader("Upload your PDF", type="pdf")
331 | 
332 |     # Fetching remote PDFs using Unstructured
333 |     # loader = OnlinePDFLoader("https://arxiv.org/pdf/2302.03803.pdf")
334 |     # data = loader.load()
335 |     # print(data)
336 | 
337 |     # extract the text
338 |     if pdf is not None:
339 |         pdf_reader = PdfReader(pdf)
340 |         text = ""
341 |         for page in pdf_reader.pages:
342 |             text += page.extract_text()
343 | 
344 |         # Split into chunks
345 |         text_splitter = CharacterTextSplitter(
346 |             separator="\n",
347 |             chunk_size=400,
348 |             chunk_overlap=80,  # I usually set chunk_overlap == 20% of chunk_size
349 |             length_function=len
350 |         )
351 |         chunks = text_splitter.split_text(text)
352 | 
353 |         # create embeddings
354 |         embeddings = OpenAIEmbeddings()
355 | 
356 |         # TODO: render image of pdf
357 |         # show_pdf_as_iframe(pdf)
358 | 
359 |         knowledge_base = Pinecone.from_existing_index(index_name='huxleypdf', embedding=embeddings, namespace='ns1')
360 | 
361 |         # show user input
362 |         user_question = st.text_input("Ask a question about your PDF: ")
363 |         if user_question:
364 |             docs = knowledge_base.similarity_search(user_question)
365 |             llm = OpenAI()
366 |             chain = load_qa_chain(llm, chain_type="stuff")
367 |             with get_openai_callback() as cb:
368 |                 response = chain.run(input_documents=docs, question=user_question)
369 |                 print(cb)
370 | 
371 |             st.write(response)
372 | 
373 | # TODO: Add error handling
374 | 
375 | if __name__ == '__main__':
376 |     main()
377 |
--------------------------------------------------------------------------------
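Note: `sidebar()` calls `check_openai_api_key()`, `check_pinecone_api_key()`, `check_pinecone_index()`, and `check_pinecone_namespace()`, which are defined somewhere in the first 99 lines of huxley.py that this dump omits. A minimal sketch of what such helpers could look like, assuming each one simply reports whether its setting is present in the environment (the commented-out alternatives in `sidebar()` suggest the real versions may also compare against `st.secrets`):

```python
# Hypothetical reconstruction -- the real definitions live in the omitted
# top of huxley.py and may differ.
import os

def _is_set(name: str) -> bool:
    """Return True if the named environment variable is set and non-empty."""
    return bool(os.environ.get(name))

def check_openai_api_key() -> bool:
    return _is_set('OPENAI_API_KEY')

def check_pinecone_api_key() -> bool:
    return _is_set('PINECONE_API_KEY')

def check_pinecone_index() -> bool:
    return _is_set('PINECONE_INDEX')

def check_pinecone_namespace() -> bool:
    return _is_set('PINECONE_NAMESPACE')
```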
/huxleychat-home.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fredsiika/huxley-pdf/02c82e78f189e55453393f84f2c5bff6967ee77e/huxleychat-home.png
--------------------------------------------------------------------------------
/huxleychat-how-it-works.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fredsiika/huxley-pdf/02c82e78f189e55453393f84f2c5bff6967ee77e/huxleychat-how-it-works.png
--------------------------------------------------------------------------------
/huxleychat-sidebar-apikey.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fredsiika/huxley-pdf/02c82e78f189e55453393f84f2c5bff6967ee77e/huxleychat-sidebar-apikey.png
--------------------------------------------------------------------------------
/huxleychat_banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fredsiika/huxley-pdf/02c82e78f189e55453393f84f2c5bff6967ee77e/huxleychat_banner.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pydeck
2 | lz4
3 | certifi
4 | gunicorn
5 | anyio
6 | pyrsistent
7 | rich
8 | pymongo
9 | tiktoken
10 | asgiref
11 | websockets
12 | packaging
13 | toml
14 | backoff
15 | colorama
16 | djangorestframework
17 | zipp
18 | jmespath
19 | tenacity
20 | protobuf
21 | psycopg2-binary
22 | Pympler
23 | greenlet
24 | pandas
25 | clickhouse-connect
26 | fastapi
27 | six
28 | zstandard
29 | numpy
30 | uvicorn
31 | openapi-schema-pydantic
32 | markdown-it-py
33 | boto3
34 | mpmath
35 | Pillow
36 | dnspython
37 | watchdog
38 | PyYAML
39 | smmap
40 | Pygments
41 | s3transfer
42 | botocore
43 | pytz
44 | regex
45 | django-dotenv
46 | posthog
47 | scikit-learn
48 | h11
49 | requests
50 | typing_extensions
51 | jsonschema
52 | python-dotenv
53 | typing-inspect
54 | httptools
55 | aiosignal
56 | dataclasses-json
57 | sentry-sdk
58 | tzdata
59 | importlib-metadata
60 | pyarrow
61 | validators
62 | idna
63 | pinecone-client
64 | djangorestframework-simplejwt
65 | sentence-transformers
66 | torch
67 | mdurl
68 | scipy
69 | tzlocal
70 | gitdb
71 | hnswlib
72 | urllib3
73 | altair
74 | frozenlist
75 | threadpoolctl
76 | yarl
77 | multidict
78 | pytz-deprecation-shim
79 | starlette
80 | fsspec
81 | Jinja2
82 | async-timeout
83 | marshmallow
84 | aiohttp
85 | attrs
86 | django-ninja
87 | nltk
88 | PyMuPDF
89 | sqlparse
90 | joblib
91 | streamlit-extras
92 | tornado
93 | win32-setctime
94 | pydantic
95 | sentencepiece
96 | charset-normalizer
97 | cachetools
98 | click
99 | MarkupSafe
100 | PyJWT
101 | filelock
102 | entrypoints
103 | monotonic
104 | sympy
105 | python-dateutil
106 | tokenizers
107 | sniffio
108 | watchfiles
109 | openai
110 | django-cors-headers
111 | SQLAlchemy
112 | duckdb
113 | networkx
114 | mypy-extensions
115 | toolz
116 | streamlit
117 | Django
118 | blinker
119 | decorator
120 | GitPython
121 | tqdm
122 | torchvision
123 | PyPDF2
124 | langchain
125 | loguru
126 | transformers
127 | marshmallow-enum
128 | faiss-cpu
129 | pdf2image
--------------------------------------------------------------------------------
/templates/condense_prompt.py:
--------------------------------------------------------------------------------
1 | CONDENSE_PROMPT = """Given the following conversation and a follow-up question, rephrase the follow-up question to be a standalone question.
2 | 
3 | Chat History:
4 | {chat_history}
5 | Follow Up Input: {question}
6 | Standalone question:"""
--------------------------------------------------------------------------------
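`CONDENSE_PROMPT` is a plain string with `{chat_history}` and `{question}` placeholders; utils/query.py wraps it in a LangChain `PromptTemplate`. A quick illustration of the rendered prompt (the sample conversation is invented):

```python
from langchain.prompts import PromptTemplate
from templates.condense_prompt import CONDENSE_PROMPT

condense = PromptTemplate.from_template(CONDENSE_PROMPT)
rendered = condense.format(
    chat_history="Human: What is HuxleyPDF?\nAssistant: A PDF question-answering app.",
    question="How do I install it?",
)
print(rendered)
# The LLM's completion would be a standalone question, e.g. "How do I install HuxleyPDF?"
```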
/templates/qa_prompt.py:
--------------------------------------------------------------------------------
1 | QA_PROMPT = """You are a helpful AI assistant named HuxleyPDF. Use the following pieces of context to answer the question at the end.
2 | If you don't know the answer, just say you don't know. DO NOT try to make up an answer.
3 | If the question is not related to the context, politely respond that you are tuned to only answer questions that are related to the context.
4 | Use as much detail as possible when responding.
5 |
6 | {context}
7 |
8 | Question: {question}
9 | Helpful answer in markdown format:"""
--------------------------------------------------------------------------------
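Together the two templates implement the usual conversational-retrieval pattern: `CONDENSE_PROMPT` first rewrites a follow-up into a standalone question, then `QA_PROMPT` answers it over the retrieved chunks. A sketch of that flow, with hypothetical `ask_llm` and `retrieve` callables standing in for the chain internals:

```python
from templates.condense_prompt import CONDENSE_PROMPT
from templates.qa_prompt import QA_PROMPT

def answer(question, chat_history, ask_llm, retrieve):
    """Two-step conversational retrieval.

    `ask_llm(prompt) -> str` and `retrieve(query) -> list[Document]` are
    hypothetical stand-ins for the LLM call and the vector store search.
    """
    # Step 1: rewrite the follow-up into a standalone question.
    standalone = ask_llm(CONDENSE_PROMPT.format(
        chat_history=chat_history, question=question))
    # Step 2: answer the standalone question over the retrieved context.
    context = "\n\n".join(doc.page_content for doc in retrieve(standalone))
    return ask_llm(QA_PROMPT.format(context=context, question=standalone))
```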
/utils/ingest.py:
--------------------------------------------------------------------------------
1 | from langchain.embeddings.openai import OpenAIEmbeddings
2 | from langchain.document_loaders import DirectoryLoader, PyMuPDFLoader
3 | from langchain.vectorstores import Pinecone
4 | import pinecone
5 | from langchain.text_splitter import RecursiveCharacterTextSplitter
6 | from langchain.vectorstores import Chroma
7 |
8 |
9 | def ingest(openai_api_key, pinecone_api_key, pinecone_environment, pinecone_index, pinecone_namespace, use_pinecone):
10 |     loader = DirectoryLoader('docs', glob="**/*.pdf", loader_cls=PyMuPDFLoader)
11 |     documents = loader.load()
12 | 
13 |     text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
14 |     documents = text_splitter.split_documents(documents)
15 |     embeddings = OpenAIEmbeddings(
16 |         model='text-embedding-ada-002',
17 |         openai_api_key=openai_api_key
18 |     )
19 | 
20 |     if use_pinecone:
21 |         pinecone.init(
22 |             api_key=pinecone_api_key,  # find at app.pinecone.io
23 |             environment=pinecone_environment  # next to api key in console
24 |         )
25 | 
26 |         Pinecone.from_documents(
27 |             documents, embeddings,
28 |             index_name=pinecone_index,
29 |             namespace=pinecone_namespace
30 |         )
31 | 
32 |         return 'Finished Ingesting, stored at Pinecone'
33 | 
34 |     else:
35 |         vectorstore = Chroma.from_documents(
36 |             documents,
37 |             embeddings,
38 |             collection_name="my_collection",
39 |             persist_directory="./vectorstore"
40 |         )
41 | 
42 |         return 'Finished Ingesting, stored at ./vectorstore'
43 | 
--------------------------------------------------------------------------------
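A usage sketch for `ingest()`; the key names mirror example.env, but the actual values and the index/namespace literals (`'huxleypdf'`, `'ns1'`, taken from huxley.py) are assumptions:

```python
import os
from utils.ingest import ingest

status = ingest(
    openai_api_key=os.environ['OPENAI_API_KEY'],
    pinecone_api_key=os.environ['PINECONE_API_KEY'],
    pinecone_environment=os.environ['PINECONE_ENVIRONMENT'],
    pinecone_index='huxleypdf',   # index name used elsewhere in this repo
    pinecone_namespace='ns1',
    use_pinecone=True,            # False falls back to a local Chroma store
)
print(status)  # 'Finished Ingesting, stored at Pinecone'
```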
/utils/query.py:
--------------------------------------------------------------------------------
1 | from langchain.embeddings.openai import OpenAIEmbeddings
2 | from langchain.chains import ConversationalRetrievalChain
3 | from langchain.chat_models import ChatOpenAI
4 | from langchain.prompts import PromptTemplate
5 | from langchain.vectorstores import Pinecone
6 | import pinecone
7 | from templates.qa_prompt import QA_PROMPT
8 | from templates.condense_prompt import CONDENSE_PROMPT
9 | from langchain.vectorstores import Chroma
10 | 
11 | 
12 | def query(openai_api_key, pinecone_api_key, pinecone_environment, pinecone_index, pinecone_namespace, temperature, sources, use_pinecone):
13 |     embeddings = OpenAIEmbeddings(
14 |         model='text-embedding-ada-002', openai_api_key=openai_api_key)
15 | 
16 |     if use_pinecone:
17 |         pinecone.init(api_key=pinecone_api_key,
18 |                       environment=pinecone_environment)
19 |         vectorstore = Pinecone.from_existing_index(
20 |             index_name=pinecone_index, embedding=embeddings, text_key='text', namespace=pinecone_namespace)
21 |     else:
22 |         # Load the persisted database from disk
23 |         persist_directory = "./vectorstore"
24 |         vectorstore = Chroma(
25 |             persist_directory=persist_directory, embedding_function=embeddings, collection_name="my_collection")
26 | 
27 |     model = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=temperature,
28 |                        openai_api_key=openai_api_key, streaming=True)  # temperature ranges from 0 to 2
29 |     retriever = vectorstore.as_retriever(search_kwargs={"k": sources})  # 9 is the max number of sources
30 | 
31 |     # The prompt templates belong on the chain, not the retriever: the
32 |     # condense prompt rewrites the follow-up question, and the QA prompt
33 |     # formats the final answer over the retrieved context.
34 |     qa = ConversationalRetrievalChain.from_llm(
35 |         llm=model,
36 |         retriever=retriever,
37 |         condense_question_prompt=PromptTemplate.from_template(CONDENSE_PROMPT),
38 |         combine_docs_chain_kwargs={"prompt": PromptTemplate.from_template(QA_PROMPT)},
39 |         return_source_documents=True
40 |     )
41 |     return qa
42 | 
--------------------------------------------------------------------------------
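And a matching sketch for `query()`: the returned `ConversationalRetrievalChain` takes a question plus the running chat history and, because `return_source_documents=True`, yields the answer together with its sources. The index/namespace values mirror those used in huxley.py; everything else is a placeholder:

```python
import os
from utils.query import query

qa = query(
    openai_api_key=os.environ['OPENAI_API_KEY'],
    pinecone_api_key=os.environ['PINECONE_API_KEY'],
    pinecone_environment=os.environ['PINECONE_ENVIRONMENT'],
    pinecone_index='huxleypdf',
    pinecone_namespace='ns1',
    temperature=0.5,
    sources=4,          # chunks to retrieve; the code notes 9 is the max
    use_pinecone=True,
)
result = qa({"question": "What is the white paper about?", "chat_history": []})
print(result["answer"])
for doc in result["source_documents"]:
    print(doc.metadata)
```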