├── .devcontainer
│   └── devcontainer.json
├── .gitattributes
├── .github
│   └── ISSUE_TEMPLATE
│       ├── bug_report.md
│       └── feature_request.md
├── .gitignore
├── .grit
│   └── .gitignore
├── LICENSE
├── README.md
├── api.py
├── chat_history.json
├── chatbot.py
├── docs
│   └── white_paper.pdf
├── example.env
├── huxley.py
├── huxleychat-home.png
├── huxleychat-how-it-works.png
├── huxleychat-sidebar-apikey.png
├── huxleychat_banner.png
├── requirements.txt
├── templates
│   ├── condense_prompt.py
│   └── qa_prompt.py
└── utils
    ├── ingest.py
    └── query.py
/.devcontainer/devcontainer.json:
--------------------------------------------------------------------------------
1 | {
2 |     "name": "Python 3",
3 |     // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile
4 |     "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye",
5 |     "customizations": {
6 |         "codespaces": {
7 |             "openFiles": [
8 |                 "README.md",
9 |                 "huxley.py"
10 |             ]
11 |         },
12 |         "vscode": {
13 |             "settings": {},
14 |             "extensions": [
15 |                 "ms-python.python",
16 |                 "ms-python.vscode-pylance"
17 |             ]
18 |         }
19 |     },
20 |     "updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y
--------------------------------------------------------------------------------
/huxley.py:
--------------------------------------------------------------------------------
99 |             "), and [OpenAI]() and made by "
100 | "[@fredsiika]()."
101 | "\n\n"
102 | "View Source Code on [Github]()"
103 | ))
104 | with col2:
105 | st.image(image='huxleychat_banner.png', width=300, caption='Tutorial and accompanying documentation coming soon.')
106 | # End Top Information
107 | return
108 |
109 | # Function to set up the environment
110 | def setup_environment():
111 |     print('Setting up environment')
112 |     # connect_to_pinecone(index)
113 |
114 | def connect_to_pinecone(index_name):
115 |     """Connect to Pinecone and return the index stats."""
116 | 
117 |     # find API key in console at app.pinecone.io
118 |     PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
119 |     # find ENV (cloud region) next to API key in console
120 |     PINECONE_ENVIRONMENT = os.getenv('PINECONE_ENVIRONMENT')
121 | 
122 |     openai_model = 'gpt-3.5-turbo'
123 |     temperature = 0.5
124 | 
125 |     # initialize pinecone
126 |     pinecone.init(
127 |         api_key=PINECONE_API_KEY,  # find at app.pinecone.io
128 |         environment=PINECONE_ENVIRONMENT  # next to api key in console
129 |     )
130 | 
131 |     model = ChatOpenAI(
132 |         model_name=openai_model,
133 |         temperature=temperature,
134 |         openai_api_key=os.getenv("OPENAI_API_KEY"),
135 |         streaming=False
136 |     )  # temperature ranges from 0 to 2
137 | 
138 |     # only create the index if it doesn't exist
139 |     if index_name not in pinecone.list_indexes():
140 |         pinecone.create_index(
141 |             name=index_name,
142 |             dimension=1536,  # output dimension of text-embedding-ada-002 (a ChatOpenAI model has no embedding dimension)
143 |             metric='cosine'
144 |         )
145 | 
146 |     # now connect to the index
147 |     print(f"Connecting to Pinecone...\nindex_name: {index_name}")
148 |     index = pinecone.GRPCIndex(index_name)
149 | 
150 |     # wait a moment for the index to be fully initialized
151 |     time.sleep(1)
152 | 
153 |     loader = PyMuPDFLoader("./docs/white_paper.pdf")
154 |     documents = loader.load()
155 |     text_splitter = CharacterTextSplitter(chunk_size=400, chunk_overlap=20)
156 |     docs = text_splitter.split_documents(documents)
157 |     embeddings = OpenAIEmbeddings()
158 | 
159 |     # embed the chunks and upsert them into the index
160 |     docsearch = Pinecone.from_documents(docs, embeddings, index_name=index_name)
161 |     query = "Why did the chicken cross the road?"  # throwaway smoke-test query
162 |     docs = docsearch.similarity_search(query)
163 |     print(f'\n{docs[0].page_content}\n')
164 | 
165 |     # print(f"\nClients connected to Pinecone index {index_name} \n{index.describe_index_stats()}\n")
166 |     return index.describe_index_stats()
167 |
168 | def clear_submit():
169 |     st.session_state["submit"] = False
170 |
171 | def sidebar():
172 |     with st.sidebar:
173 |         st.markdown('''## About HuxleyPDF''')
174 |         st.markdown('''
175 | HuxleyPDF is a Python application that lets you upload a PDF and ask questions about it using natural language.
176 | 
177 | ## How it works:
178 | 
179 | Upload your own documents and chat with your PDF files in this GPT-4-powered app.
180 | Built with [LangChain](https://docs.langchain.com/docs/), [Pinecone Vector Db](https://pinecone.io/), deployed on [Streamlit](https://streamlit.io)
181 | 
182 | ## How to use:
183 | 
184 | 1. Upload a PDF
185 | 2. Ask a question about the PDF
186 | 3. Get an answer about the PDF
187 | 4. Repeat
188 | 
189 | ## Before you start using HuxleyPDF:
190 | 
191 | - You need an OpenAI API key. You can get one [here](https://platform.openai.com/).
192 | - You need a Pinecone API key. You can get one [here](https://www.pinecone.io/).
193 | - You need a Pinecone environment. You can create one [here](https://www.pinecone.io/).
194 | 
195 | ## How to obtain your OpenAI API key:
196 | 
197 | 1. Sign in to your OpenAI account. If you do not have an account, [click here](https://platform.openai.com/signup) to sign up.
198 | 
199 | 2. Visit the [OpenAI API keys page.](https://platform.openai.com/account/api-keys)
200 | 
201 | 
202 | 
203 | 
204 | 3. Create a new secret key and copy & paste it into the "API key" input field below. 👇🏾
205 | ''')
206 | 
207 |         st.markdown('''
208 | ## OpenAI API key
209 | 
210 | **Tips:**
211 | 
212 | - The official OpenAI API is more stable than the ChatGPT free plan; however, charges based on usage do apply.
213 | - Your API key is saved locally in your browser and is not transmitted anywhere else.
214 | - If you provide an API key enabled for GPT-4, the app will support GPT-4.
215 | - Your free OpenAI API credits can expire, so please check [the status of your API key here.](https://platform.openai.com/account/usage)
216 | - Access to the API may be unstable when demand for the free tier is high.
217 | 
218 | ''')
219 |         add_vertical_space(5)
220 |         st.write('[HuxleyPDF](https://github.com/fredsiika/huxley-pdf) was made with ❤️ by [Fred](https://github.com/fredsiika)')
221 | 
222 |         st.write(
223 |             "openai_api_key set: ",
224 |             check_openai_api_key()
225 |             # f'{True}' if os.environ.get('OPENAI_API_KEY') else f'{False}'
226 |         )
227 |         st.write(
228 |             "pinecone_api set: ",
229 |             check_pinecone_api_key()
230 |             # True if os.environ.get('PINECONE_API_KEY') == st.secrets['PINECONE_API_KEY'] else False
231 |         )
232 |         st.write(
233 |             "pinecone_index set: ",
234 |             check_pinecone_index()
235 |             # os.environ.get('PINECONE_INDEX') == st.secrets['PINECONE_INDEX'],
236 |         )
237 |         st.write(
238 |             'pinecone_namespace set: ',
239 |             check_pinecone_namespace()
240 |             # os.environ.get('PINECONE_NAMESPACE') == st.secrets['PINECONE_NAMESPACE'],
241 |         )
242 |         # st.write(
243 |         #     "pinecone_environment set: ",
244 | 
245 |         #     # os.environ.get('PINECONE_ENVIRONMENT') == st.secrets['PINECONE_ENVIRONMENT'],
246 |         # )
247 |
248 | def upload_files():
249 |     uploaded_files = st.file_uploader(
250 |         "Upload multiple files",
251 |         type="pdf",
252 |         help="doc and txt files are still in beta.",
253 |         accept_multiple_files=True,
254 |         on_change=clear_submit
255 |     )
256 | 
257 |     if not uploaded_files:  # file_uploader returns a (possibly empty) list when accept_multiple_files=True
258 |         st.info("Please upload a file of type: " + ", ".join(["pdf"]))
259 |     return uploaded_files
260 |
261 | # To get the tokenizer corresponding to a specific model in the OpenAI API:
262 | tokenizer = tiktoken.encoding_for_model('gpt-3.5-turbo') # specific tiktoken encoder which is used by gpt-3.5-turbo: https://github.com/openai/tiktoken/blob/main/tiktoken/model.py#L74
263 |
264 | def tiktoken_len(text):
265 |     """Returns the length of the text in tokens."""
266 |     tokens = tokenizer.encode(
267 |         text,
268 |         disallowed_special=()
269 |     )
270 |     return len(tokens)
271 |
272 | # Function to ingest the files
273 | def ingest_files(uploaded_files):
274 |     # find API key in console at app.pinecone.io
275 |     PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
276 |     # find ENV (cloud region) next to API key in console
277 |     PINECONE_ENVIRONMENT = os.getenv('PINECONE_ENVIRONMENT')
278 | 
279 |     try:
280 |         with st.spinner("Indexing documents... this might take a while ⏳"):
281 |             # Write the uploads to a temp dir so DirectoryLoader can read them
282 |             with tempfile.TemporaryDirectory() as tmpdir:
283 |                 for uploaded_file in uploaded_files:
284 |                     file_name = uploaded_file.name
285 |                     file_content = uploaded_file.read()
286 |                     st.write("Filename: ", file_name)
287 |                     with open(os.path.join(tmpdir, file_name), "wb") as file:
288 |                         file.write(file_content)
289 |                 loader = DirectoryLoader(tmpdir, glob="**/*.pdf", loader_cls=PyMuPDFLoader)  # type: ignore
290 |                 documents = loader.load()
291 |                 text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100, length_function=tiktoken_len)
292 |                 documents = text_splitter.split_documents(documents)
293 |                 pinecone.init(
294 |                     api_key=PINECONE_API_KEY,  # find at app.pinecone.io
295 |                     environment=PINECONE_ENVIRONMENT  # next to api key in console
296 |                 )
297 |                 openai_api_key = os.getenv('OPENAI_API_KEY')
298 |                 embeddings = OpenAIEmbeddings(model='text-embedding-ada-002', openai_api_key=openai_api_key, client=None)
299 |                 Pinecone.from_documents(documents, embeddings, index_name='huxleypdf', namespace='ns1')
300 |                 # Pinecone.from_existing_index only connects to the index; it does not upsert the new documents
301 |         st.success("Ingested File!")
302 |         st.session_state["api_key_configured"] = True
303 |     except Exception as e:
304 |         st.error(f"Error while ingesting the files: {str(e)}")
305 |         return None
306 |
307 | # Function to display PDF as image on mobile devices
308 | def show_pdf_as_image(pdf_bytes):
309 |     images = convert_from_bytes(pdf_bytes)
310 |     for image in images:
311 |         st.image(image)
312 | 
313 | # Function to display PDF as iFrame on desktop
314 | def show_pdf_as_iframe(file):
315 |     if file is not None:
316 |         pdf_bytes = file.read()
317 |         base64_pdf = base64.b64encode(pdf_bytes).decode('utf-8')
318 |         pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="700" height="1000" type="application/pdf"></iframe>'  # iframe markup restored; dimensions are arbitrary
319 |         st.markdown(pdf_display, unsafe_allow_html=True)
320 | 
321 |         pdf_reader = PdfReader(file)
322 |
323 | def main():
324 |     render_header()
325 |     sidebar()
326 |     # setup_environment()
327 | 
328 | 
329 |     # Upload file
330 |     pdf = st.file_uploader("Upload your PDF", type="pdf")
331 | 
332 |     # Fetching remote PDFs using Unstructured
333 |     # loader = OnlinePDFLoader("https://arxiv.org/pdf/2302.03803.pdf")
334 |     # data = loader.load()
335 |     # print(data)
336 | 
337 |     # extract the text
338 |     if pdf is not None:
339 |         pdf_reader = PdfReader(pdf)
340 |         text = ""
341 |         for page in pdf_reader.pages:
342 |             text += page.extract_text()
343 | 
344 |         # Split into chunks
345 |         text_splitter = CharacterTextSplitter(
346 |             separator="\n",
347 |             chunk_size=400,
348 |             chunk_overlap=80,  # I usually set chunk_overlap == 20% of chunk_size
349 |             length_function=len
350 |         )
351 |         chunks = text_splitter.split_text(text)
352 | 
353 |         # create embeddings
354 |         embeddings = OpenAIEmbeddings()
355 | 
356 |         # TODO: render image of pdf
357 |         # show_pdf_as_iframe(pdf)
358 | 
359 |         knowledge_base = Pinecone.from_existing_index(index_name='huxleypdf', embedding=embeddings, namespace='ns1')
360 | 
361 |         # show user input
362 |         user_question = st.text_input("Ask a question about your PDF: ")
363 |         if user_question:
364 |             docs = knowledge_base.similarity_search(user_question)
365 |             llm = OpenAI()
366 |             chain = load_qa_chain(llm, chain_type="stuff")
367 |             with get_openai_callback() as cb:
368 |                 response = chain.run(input_documents=docs, question=user_question)
369 |                 print(cb)
370 | 
371 |             st.write(response)
372 | 
373 | # TODO: Add error handling
374 | 
375 | if __name__ == '__main__':
376 |     main()
377 |
--------------------------------------------------------------------------------
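Note: `sidebar()` calls `check_openai_api_key()`, `check_pinecone_api_key()`, `check_pinecone_index()`, and `check_pinecone_namespace()`, which are defined somewhere in the first 99 lines of huxley.py that this dump omits. A minimal sketch of what such helpers could look like, assuming each one simply reports whether its setting is present in the environment (the commented-out alternatives in `sidebar()` suggest the real versions may also compare against `st.secrets`):

```python
# Hypothetical reconstruction -- the real definitions live in the omitted
# top of huxley.py and may differ.
import os

def _is_set(name: str) -> bool:
    """Return True if the named environment variable is set and non-empty."""
    return bool(os.environ.get(name))

def check_openai_api_key() -> bool:
    return _is_set('OPENAI_API_KEY')

def check_pinecone_api_key() -> bool:
    return _is_set('PINECONE_API_KEY')

def check_pinecone_index() -> bool:
    return _is_set('PINECONE_INDEX')

def check_pinecone_namespace() -> bool:
    return _is_set('PINECONE_NAMESPACE')
```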
/huxleychat-home.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fredsiika/huxley-pdf/02c82e78f189e55453393f84f2c5bff6967ee77e/huxleychat-home.png
--------------------------------------------------------------------------------
/huxleychat-how-it-works.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fredsiika/huxley-pdf/02c82e78f189e55453393f84f2c5bff6967ee77e/huxleychat-how-it-works.png
--------------------------------------------------------------------------------
/huxleychat-sidebar-apikey.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fredsiika/huxley-pdf/02c82e78f189e55453393f84f2c5bff6967ee77e/huxleychat-sidebar-apikey.png
--------------------------------------------------------------------------------
/huxleychat_banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fredsiika/huxley-pdf/02c82e78f189e55453393f84f2c5bff6967ee77e/huxleychat_banner.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pydeck
2 | lz4
3 | certifi
4 | gunicorn
5 | anyio
6 | pyrsistent
7 | rich
8 | pymongo
9 | tiktoken
10 | asgiref
11 | websockets
12 | packaging
13 | toml
14 | backoff
15 | colorama
16 | djangorestframework
17 | zipp
18 | jmespath
19 | tenacity
20 | protobuf
21 | psycopg2-binary
22 | Pympler
23 | greenlet
24 | pandas
25 | clickhouse-connect
26 | fastapi
27 | six
28 | zstandard
29 | numpy
30 | uvicorn
31 | openapi-schema-pydantic
32 | markdown-it-py
33 | boto3
34 | mpmath
35 | Pillow
36 | dnspython
37 | watchdog
38 | PyYAML
39 | smmap
40 | Pygments
41 | s3transfer
42 | botocore
43 | pytz
44 | regex
45 | django-dotenv
46 | posthog
47 | scikit-learn
48 | h11
49 | requests
50 | typing_extensions
51 | jsonschema
52 | python-dotenv
53 | typing-inspect
54 | httptools
55 | aiosignal
56 | dataclasses-json
57 | sentry-sdk
58 | tzdata
59 | importlib-metadata
60 | pyarrow
61 | validators
62 | idna
63 | pinecone-client
64 | djangorestframework-simplejwt
65 | sentence-transformers
66 | torch
67 | mdurl
68 | scipy
69 | tzlocal
70 | gitdb
71 | hnswlib
72 | urllib3
73 | altair
74 | frozenlist
75 | threadpoolctl
76 | yarl
77 | multidict
78 | pytz-deprecation-shim
79 | starlette
80 | fsspec
81 | Jinja2
82 | async-timeout
83 | marshmallow
84 | aiohttp
85 | attrs
86 | django-ninja
87 | nltk
88 | PyMuPDF
89 | sqlparse
90 | joblib
91 | streamlit-extras
92 | tornado
93 | win32-setctime
94 | pydantic
95 | sentencepiece
96 | charset-normalizer
97 | cachetools
98 | click
99 | MarkupSafe
100 | PyJWT
101 | filelock
102 | entrypoints
103 | monotonic
104 | sympy
105 | python-dateutil
106 | tokenizers
107 | sniffio
108 | watchfiles
109 | openai
110 | django-cors-headers
111 | SQLAlchemy
112 | duckdb
113 | networkx
114 | mypy-extensions
115 | toolz
116 | streamlit
117 | Django
118 | blinker
119 | decorator
120 | GitPython
121 | tqdm
122 | torchvision
123 | PyPDF2
124 | langchain
125 | loguru
126 | transformers
127 | marshmallow-enum
128 | faiss-cpu
129 | pdf2image
--------------------------------------------------------------------------------
/templates/condense_prompt.py:
--------------------------------------------------------------------------------
1 | CONDENSE_PROMPT = """Given the following conversation and a follow-up question, rephrase the follow-up question to be a standalone question.
2 | 
3 | Chat History:
4 | {chat_history}
5 | Follow Up Input: {question}
6 | Standalone question:"""
--------------------------------------------------------------------------------
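`CONDENSE_PROMPT` is a plain string with `{chat_history}` and `{question}` placeholders; utils/query.py wraps it in a LangChain `PromptTemplate`. A quick illustration of the rendered prompt (the sample conversation is invented):

```python
from langchain.prompts import PromptTemplate
from templates.condense_prompt import CONDENSE_PROMPT

condense = PromptTemplate.from_template(CONDENSE_PROMPT)
rendered = condense.format(
    chat_history="Human: What is HuxleyPDF?\nAssistant: A PDF question-answering app.",
    question="How do I install it?",
)
print(rendered)
# The LLM's completion would be a standalone question, e.g. "How do I install HuxleyPDF?"
```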
/templates/qa_prompt.py:
--------------------------------------------------------------------------------
1 | QA_PROMPT = """You are a helpful AI assistant named HuxleyPDF. Use the following pieces of context to answer the question at the end.
2 | If you don't know the answer, just say you don't know. DO NOT try to make up an answer.
3 | If the question is not related to the context, politely respond that you are tuned to only answer questions that are related to the context.
4 | Use as much detail as possible when responding.
5 |
6 | {context}
7 |
8 | Question: {question}
9 | Helpful answer in markdown format:"""
--------------------------------------------------------------------------------
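Together the two templates implement the usual conversational-retrieval pattern: `CONDENSE_PROMPT` first rewrites a follow-up into a standalone question, then `QA_PROMPT` answers it over the retrieved chunks. A sketch of that flow, with hypothetical `ask_llm` and `retrieve` callables standing in for the chain internals:

```python
from templates.condense_prompt import CONDENSE_PROMPT
from templates.qa_prompt import QA_PROMPT

def answer(question, chat_history, ask_llm, retrieve):
    """Two-step conversational retrieval.

    `ask_llm(prompt) -> str` and `retrieve(query) -> list[Document]` are
    hypothetical stand-ins for the LLM call and the vector store search.
    """
    # Step 1: rewrite the follow-up into a standalone question.
    standalone = ask_llm(CONDENSE_PROMPT.format(
        chat_history=chat_history, question=question))
    # Step 2: answer the standalone question over the retrieved context.
    context = "\n\n".join(doc.page_content for doc in retrieve(standalone))
    return ask_llm(QA_PROMPT.format(context=context, question=standalone))
```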
/utils/ingest.py:
--------------------------------------------------------------------------------
1 | from langchain.embeddings.openai import OpenAIEmbeddings
2 | from langchain.document_loaders import DirectoryLoader, PyMuPDFLoader
3 | from langchain.vectorstores import Pinecone
4 | import pinecone
5 | from langchain.text_splitter import RecursiveCharacterTextSplitter
6 | from langchain.vectorstores import Chroma
7 |
8 |
9 | def ingest(openai_api_key, pinecone_api_key, pinecone_environment, pinecone_index, pinecone_namespace, use_pinecone):
10 |     loader = DirectoryLoader('docs', glob="**/*.pdf", loader_cls=PyMuPDFLoader)
11 |     documents = loader.load()
12 | 
13 |     text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=100)
14 |     documents = text_splitter.split_documents(documents)
15 |     embeddings = OpenAIEmbeddings(
16 |         model='text-embedding-ada-002',
17 |         openai_api_key=openai_api_key
18 |     )
19 | 
20 |     if use_pinecone:
21 |         pinecone.init(
22 |             api_key=pinecone_api_key,  # find at app.pinecone.io
23 |             environment=pinecone_environment  # next to api key in console
24 |         )
25 | 
26 |         Pinecone.from_documents(
27 |             documents, embeddings,
28 |             index_name=pinecone_index,
29 |             namespace=pinecone_namespace
30 |         )
31 | 
32 |         return 'Finished Ingesting, stored at Pinecone'
33 | 
34 |     else:
35 |         vectorstore = Chroma.from_documents(
36 |             documents,
37 |             embeddings,
38 |             collection_name="my_collection",
39 |             persist_directory="./vectorstore"
40 |         )
41 | 
42 |         return 'Finished Ingesting, stored at ./vectorstore'
43 | 
--------------------------------------------------------------------------------
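A usage sketch for `ingest()`; the key names mirror example.env, but the actual values and the index/namespace literals (`'huxleypdf'`, `'ns1'`, taken from huxley.py) are assumptions:

```python
import os
from utils.ingest import ingest

status = ingest(
    openai_api_key=os.environ['OPENAI_API_KEY'],
    pinecone_api_key=os.environ['PINECONE_API_KEY'],
    pinecone_environment=os.environ['PINECONE_ENVIRONMENT'],
    pinecone_index='huxleypdf',   # index name used elsewhere in this repo
    pinecone_namespace='ns1',
    use_pinecone=True,            # False falls back to a local Chroma store
)
print(status)  # 'Finished Ingesting, stored at Pinecone'
```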
/utils/query.py:
--------------------------------------------------------------------------------
1 | from langchain.embeddings.openai import OpenAIEmbeddings
2 | from langchain.chains import ConversationalRetrievalChain
3 | from langchain.chat_models import ChatOpenAI
4 | from langchain.prompts import PromptTemplate
5 | from langchain.vectorstores import Pinecone
6 | import pinecone
7 | from templates.qa_prompt import QA_PROMPT
8 | from templates.condense_prompt import CONDENSE_PROMPT
9 | from langchain.vectorstores import Chroma
10 | 
11 | 
12 | def query(openai_api_key, pinecone_api_key, pinecone_environment, pinecone_index, pinecone_namespace, temperature, sources, use_pinecone):
13 |     embeddings = OpenAIEmbeddings(
14 |         model='text-embedding-ada-002', openai_api_key=openai_api_key)
15 | 
16 |     if use_pinecone:
17 |         pinecone.init(api_key=pinecone_api_key,
18 |                       environment=pinecone_environment)
19 |         vectorstore = Pinecone.from_existing_index(
20 |             index_name=pinecone_index, embedding=embeddings, text_key='text', namespace=pinecone_namespace)
21 |     else:
22 |         # Load the persisted database from disk
23 |         persist_directory = "./vectorstore"
24 |         vectorstore = Chroma(
25 |             persist_directory=persist_directory, embedding_function=embeddings, collection_name="my_collection")
26 | 
27 |     model = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=temperature,
28 |                        openai_api_key=openai_api_key, streaming=True)  # temperature ranges from 0 to 2
29 |     retriever = vectorstore.as_retriever(search_kwargs={"k": sources})  # 9 is the max number of sources
30 | 
31 |     # The prompt templates belong on the chain, not the retriever: the
32 |     # condense prompt rewrites the follow-up question, and the QA prompt
33 |     # formats the final answer over the retrieved context.
34 |     qa = ConversationalRetrievalChain.from_llm(
35 |         llm=model,
36 |         retriever=retriever,
37 |         condense_question_prompt=PromptTemplate.from_template(CONDENSE_PROMPT),
38 |         combine_docs_chain_kwargs={"prompt": PromptTemplate.from_template(QA_PROMPT)},
39 |         return_source_documents=True
40 |     )
41 |     return qa
42 | 
--------------------------------------------------------------------------------
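And a matching sketch for `query()`: the returned `ConversationalRetrievalChain` takes a question plus the running chat history and, because `return_source_documents=True`, yields the answer together with its sources. The index/namespace values mirror those used in huxley.py; everything else is a placeholder:

```python
import os
from utils.query import query

qa = query(
    openai_api_key=os.environ['OPENAI_API_KEY'],
    pinecone_api_key=os.environ['PINECONE_API_KEY'],
    pinecone_environment=os.environ['PINECONE_ENVIRONMENT'],
    pinecone_index='huxleypdf',
    pinecone_namespace='ns1',
    temperature=0.5,
    sources=4,          # chunks to retrieve; the code notes 9 is the max
    use_pinecone=True,
)
result = qa({"question": "What is the white paper about?", "chat_history": []})
print(result["answer"])
for doc in result["source_documents"]:
    print(doc.metadata)
```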