├── tests ├── __init__.py ├── test_app.py ├── test_docs │ ├── w28196.pdf │ ├── mirkovic_benzel_teachingcybersecurity.pdf │ ├── Cyber_Security_Threats_in_Cloud_Literature_Review.pdf │ └── Ten_Deadly_Cyber_Security_Threats_Amid_COVID-19_Pa.pdf └── test_vortex_pfd_parser.py ├── app ├── ingest │ ├── __init__.py │ ├── utils.py │ ├── vortext_content_iterator.py │ ├── vortex_ingester.py │ └── vortex_pdf_parser.py ├── query │ ├── __init__.py │ └── vortex_query.py ├── __init__.py ├── logo.png ├── settings.py ├── data │ └── chroma │ │ ├── chroma-collections.parquet │ │ ├── chroma-embeddings.parquet │ │ └── index │ │ ├── index_742cdaf5-f415-4d2a-a87a-daada12ab26d.bin │ │ ├── id_to_uuid_742cdaf5-f415-4d2a-a87a-daada12ab26d.pkl │ │ ├── uuid_to_id_742cdaf5-f415-4d2a-a87a-daada12ab26d.pkl │ │ └── index_metadata_742cdaf5-f415-4d2a-a87a-daada12ab26d.pkl ├── .streamlit │ └── config.toml ├── ingest.py ├── query.py └── streamlit_app.py ├── .flake8 ├── vortex.png ├── docs └── w28196.pdf ├── .dockerignore ├── .github └── workflows │ └── main.yaml ├── LICENSE ├── pyproject.toml ├── README.md └── .gitignore /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/ingest/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/query/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.1.0" 2 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | 
max-line-length = 90 3 | count = true -------------------------------------------------------------------------------- /vortex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PatrickKalkman/python-docuvortex/HEAD/vortex.png -------------------------------------------------------------------------------- /app/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PatrickKalkman/python-docuvortex/HEAD/app/logo.png -------------------------------------------------------------------------------- /app/settings.py: -------------------------------------------------------------------------------- 1 | COLLECTION_NAME = "neonshield-2023-05" 2 | PERSIST_DIRECTORY = "./data/chroma" 3 | -------------------------------------------------------------------------------- /docs/w28196.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PatrickKalkman/python-docuvortex/HEAD/docs/w28196.pdf -------------------------------------------------------------------------------- /tests/test_app.py: -------------------------------------------------------------------------------- 1 | from app import __version__ 2 | 3 | 4 | def test_version(): 5 | assert __version__ == "0.1.0" 6 | -------------------------------------------------------------------------------- /tests/test_docs/w28196.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PatrickKalkman/python-docuvortex/HEAD/tests/test_docs/w28196.pdf -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.pyc 3 | *.pyo 4 | *.pyd 5 | *.egg-info 6 | *.github 7 | *.git 8 | *.test 9 | dist 10 | build 
-------------------------------------------------------------------------------- /app/data/chroma/chroma-collections.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PatrickKalkman/python-docuvortex/HEAD/app/data/chroma/chroma-collections.parquet -------------------------------------------------------------------------------- /app/data/chroma/chroma-embeddings.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PatrickKalkman/python-docuvortex/HEAD/app/data/chroma/chroma-embeddings.parquet -------------------------------------------------------------------------------- /app/.streamlit/config.toml: -------------------------------------------------------------------------------- 1 | [theme] 2 | primaryColor="#60b4ff" 3 | backgroundColor="#FFFFFF" 4 | secondaryBackgroundColor="#F0F2F6" 5 | textColor="#262730" 6 | font="sans serif" -------------------------------------------------------------------------------- /tests/test_docs/mirkovic_benzel_teachingcybersecurity.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PatrickKalkman/python-docuvortex/HEAD/tests/test_docs/mirkovic_benzel_teachingcybersecurity.pdf -------------------------------------------------------------------------------- /app/data/chroma/index/index_742cdaf5-f415-4d2a-a87a-daada12ab26d.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PatrickKalkman/python-docuvortex/HEAD/app/data/chroma/index/index_742cdaf5-f415-4d2a-a87a-daada12ab26d.bin -------------------------------------------------------------------------------- /tests/test_docs/Cyber_Security_Threats_in_Cloud_Literature_Review.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PatrickKalkman/python-docuvortex/HEAD/tests/test_docs/Cyber_Security_Threats_in_Cloud_Literature_Review.pdf -------------------------------------------------------------------------------- /tests/test_docs/Ten_Deadly_Cyber_Security_Threats_Amid_COVID-19_Pa.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PatrickKalkman/python-docuvortex/HEAD/tests/test_docs/Ten_Deadly_Cyber_Security_Threats_Amid_COVID-19_Pa.pdf -------------------------------------------------------------------------------- /app/data/chroma/index/id_to_uuid_742cdaf5-f415-4d2a-a87a-daada12ab26d.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PatrickKalkman/python-docuvortex/HEAD/app/data/chroma/index/id_to_uuid_742cdaf5-f415-4d2a-a87a-daada12ab26d.pkl -------------------------------------------------------------------------------- /app/data/chroma/index/uuid_to_id_742cdaf5-f415-4d2a-a87a-daada12ab26d.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PatrickKalkman/python-docuvortex/HEAD/app/data/chroma/index/uuid_to_id_742cdaf5-f415-4d2a-a87a-daada12ab26d.pkl -------------------------------------------------------------------------------- /app/data/chroma/index/index_metadata_742cdaf5-f415-4d2a-a87a-daada12ab26d.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PatrickKalkman/python-docuvortex/HEAD/app/data/chroma/index/index_metadata_742cdaf5-f415-4d2a-a87a-daada12ab26d.pkl -------------------------------------------------------------------------------- /app/ingest.py: -------------------------------------------------------------------------------- 1 | from dotenv import load_dotenv 2 | 3 | from ingest.vortex_ingester import VortexIngester 4 | 5 | load_dotenv() 6 | 7 | 8 | def main(): 9 
def getattr_or_default(obj, attr, default=None):
    """Return ``obj.attr``, falling back to *default* when it is missing or None.

    Args:
        obj: Object to read the attribute from.
        attr: Name of the attribute to look up.
        default: Value returned when the attribute does not exist or when
            its value is None.

    Returns:
        The attribute's value, or *default* if the attribute is absent or None.
    """
    value = getattr(obj, attr, default)
    return default if value is None else value
| branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | 11 | jobs: 12 | build-and-deploy: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - name: Check out code 17 | uses: actions/checkout@v3 18 | 19 | - name: Set up Python 20 | uses: actions/setup-python@v3 21 | with: 22 | python-version: 3.11.3 23 | 24 | - name: Install poetry 25 | run: | 26 | python -m pip install poetry==1.4.2 27 | 28 | - name: Configure poetry 29 | run: | 30 | python -m poetry config virtualenvs.in-project true 31 | 32 | - name: Cache the virtualenv 33 | uses: actions/cache@v3 34 | with: 35 | path: ./.venv 36 | key: ${{ runner.os }}-venv-${{ hashFiles('**/poetry.lock') }} 37 | 38 | - name: Install dependencies 39 | run: | 40 | python -m poetry install 41 | 42 | - name: Run test 43 | run: | 44 | poetry run pytest 45 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Patrick Kalkman 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "python-docuvortex" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Patrick Kalkman "] 6 | license = "MIT" 7 | readme = "README.md" 8 | packages = [{include = "app"}] 9 | 10 | [tool.poetry.dependencies] 11 | python = "^3.11" 12 | openapi = "^1.1.0" 13 | langchain = "^0.0.177" 14 | python-dotenv = "^1.0.0" 15 | openai = "^0.27.7" 16 | tiktoken = "^0.4.0" 17 | chromadb = "^0.3.25" 18 | pdfplumber = "^0.9.0" 19 | loguru = "^0.7.0" 20 | pypdf = "^3.9.0" 21 | streamlit = "^1.22.0" 22 | streamlit-chat = "^0.0.2.2" 23 | streamlit-extras = "^0.2.7" 24 | pillow = "^9.5.0" 25 | 26 | [tool.poetry.group.dev.dependencies] 27 | mypy = "^1.3.0" 28 | bandit = "^1.7.5" 29 | pytest = "^7.3.1" 30 | 31 | [tool.bandit] 32 | exclude_dirs = ["tests",] 33 | 34 | [tool.pytest.ini_options] 35 | filterwarnings = ["ignore::Warning"] 36 | log_cli = true 37 | log_cli_level = "INFO" 38 | log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)" 39 | log_cli_date_format = "%Y-%m-%d %H:%M:%S" 40 | 41 | [build-system] 42 | requires = ["poetry-core"] 43 | build-backend = "poetry.core.masonry.api" 44 | -------------------------------------------------------------------------------- /app/ingest/vortex_ingester.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from langchain.embeddings import OpenAIEmbeddings 3 | from langchain.vectorstores import Chroma 4 | import langchain.docstore.document as docstore 5 | from 
class VortexIngester:
    """Ingest every PDF of a folder into the persisted Chroma vector store."""

    def __init__(self, content_folder: str):
        """Remember the folder that will be scanned for PDF files.

        Args:
            content_folder: Path of the directory handed to VortexContentIterator.
        """
        self.content_folder = content_folder

    def ingest(self) -> None:
        """Parse, chunk, embed and persist all PDFs found in the content folder.

        Every file yielded by VortexContentIterator is parsed with
        VortexPdfParser; the resulting chunks are embedded with
        OpenAIEmbeddings and written to the Chroma collection
        COLLECTION_NAME under PERSIST_DIRECTORY.
        """
        vortex_content_iterator = VortexContentIterator(self.content_folder)
        vortex_pdf_parser = VortexPdfParser()

        chunks: List[docstore.Document] = []
        for document in vortex_content_iterator:
            vortex_pdf_parser.set_pdf_file_path(document)
            document_chunks = vortex_pdf_parser.clean_text_to_docs()
            chunks.extend(document_chunks)
            # Log this document's own count; len(chunks) is the running total.
            logger.info(f"Extracted {len(document_chunks)} chunks from {document}")

        if not chunks:
            # Nothing to embed, so skip the embedding and vector store calls.
            logger.warning(f"No chunks extracted from {self.content_folder}")
            return

        embeddings = OpenAIEmbeddings(client=None)
        logger.info("Loaded embeddings")
        vector_store = Chroma.from_documents(
            chunks,
            embeddings,
            collection_name=COLLECTION_NAME,
            persist_directory=PERSIST_DIRECTORY,
        )

        logger.info("Created Chroma vector store")
        vector_store.persist()
        logger.info("Persisted Chroma vector store")
def query(question: str) -> str:
    """
    Query the VortexQuery model with the provided question
    :param question: The question to ask the model
    :return: The answer from the model
    """
    # Keep one VortexQuery per session instead of building a new one for each
    # question: the instance owns the chat history and the retrieval chain,
    # so re-creating it would drop the history and rebuild the chain.
    if 'vortex_query' not in st.session_state:
        st.session_state['vortex_query'] = VortexQuery()
    answer, _ = st.session_state['vortex_query'].ask_question(question)
    return answer
17 | 18 | ### Installing 19 | 20 | The project uses [Poetry](https://python-poetry.org/) for managing dependencies. After cloning the repository, navigate to the project directory and install dependencies with the following commands: 21 | 22 | ```bash 23 | poetry install 24 | poetry shell 25 | ``` 26 | 27 | ## Running the Application 28 | Before you can run ingesting or querying you have to make sure that a .env file exists. This file should have a single line that reads ```OPENAI_API_KEY=yourkey``` 29 | 30 | ### Ingesting Documents 31 | To ingest documents, place your PDF files in the 'docs' folder, make sure that you are in the app folder, and run the following command: 32 | 33 | ```bash 34 | cd app 35 | python ingest.py 36 | ``` 37 | 38 | ### Querying Documents 39 | To query the ingested documents, make sure that you are in the app folder, run the following command and follow the interactive prompts: 40 | 41 | ```bash 42 | cd app 43 | python query.py 44 | ``` 45 | 46 | ### Running the Streamlit App 47 | To visualize and interact with the system via the Streamlit app, make sure that you are in the app folder and run the following command: 48 | 49 | ```bash 50 | streamlit run streamlit_app.py 51 | ``` 52 | 53 | ### Authors 54 | [Patrick Kalkman](https://github.com/PatrickKalkman) 55 | 56 | ### License 57 | This project is licensed under the MIT license - see the LICENSE file for details 58 | 59 | ### Acknowledgments 60 | - [Langchain Framework](https://python.langchain.com/en/latest/index.html) 61 | - [OpenAI](https://openai.com/) 62 | - [Chroma DB](https://www.trychroma.com/) 63 | - [Streamlit](https://streamlit.io/) 64 | 65 | 66 | -------------------------------------------------------------------------------- /tests/test_vortex_pfd_parser.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from app.ingest.vortex_pdf_parser import VortexPdfParser 4 | 5 | 6 | @pytest.fixture 7 | def pdf_file(): 8 | return 
"./tests/test_docs/Cyber_Security_Threats_in_Cloud_Literature_Review.pdf" 9 | 10 | 11 | pdf_test_cases = [ 12 | ("./tests/test_docs/Cyber_Security_Threats_in_Cloud_Literature_Review.pdf", 13 | "Cyber Security Threats in Cloud: Literature Review", "Almaiah", "2021-06-15"), 14 | ("./tests/test_docs/mirkovic_benzel_teachingcybersecurity.pdf", 15 | "", "", ""), 16 | ("./tests/test_docs/w28196.pdf", 17 | "Cybersecurity Risk", "", "") 18 | ] 19 | 20 | 21 | @pytest.mark.parametrize("pdf_file,title,author,creation_date", pdf_test_cases) 22 | def test_extract_metadata_from_pdf(pdf_file, title, author, creation_date): 23 | parser = VortexPdfParser() 24 | parser.set_pdf_file_path(pdf_file) 25 | metadata = parser.extract_metadata_from_pdf() 26 | assert metadata['title'] == title 27 | assert author in metadata['author'] 28 | assert creation_date in metadata['creation_date'] 29 | 30 | 31 | def test_extract_pages_from_pdf(pdf_file): 32 | parser = VortexPdfParser() 33 | parser.set_pdf_file_path(pdf_file) 34 | pages = parser.extract_pages_from_pdf() 35 | assert (len(pages) == 8) 36 | assert ('collection of most\n352 studies are remained' in pages[0][1]) 37 | 38 | 39 | def test_parse_pdf(pdf_file): 40 | parser = VortexPdfParser() 41 | parser.set_pdf_file_path(pdf_file) 42 | pages, metadata = parser.parse_pdf() 43 | assert (metadata['title'] == "Cyber Security Threats in Cloud: Literature Review") 44 | assert ('Almaiah' in metadata['author']) 45 | assert ('2021-06-15' in metadata['creation_date']) 46 | assert (len(pages) == 8) 47 | assert ('collection of most\n352 studies are remained' in pages[0][1]) 48 | 49 | 50 | def test_clean_text(pdf_file): 51 | parser = VortexPdfParser() 52 | parser.set_pdf_file_path(pdf_file) 53 | pages = parser.extract_pages_from_pdf() 54 | cleaning_functions = [parser.merge_hyphenated_words, parser.fix_newlines, 55 | parser.remove_multiple_newlines] 56 | cleaned_pages = parser.clean_text(pages, cleaning_functions) 57 | assert (len(cleaned_pages) == 8) 58 | 
def test_text_to_docs(pdf_file):
    """The first chunk built from the sample PDF contains its opening sentence."""
    pdf_parser = VortexPdfParser()
    pdf_parser.set_pdf_file_path(pdf_file)
    page_texts = pdf_parser.extract_pages_from_pdf()
    pdf_metadata = pdf_parser.extract_metadata_from_pdf()
    documents = pdf_parser.text_to_docs(page_texts, pdf_metadata)
    first_chunk_text = documents[0].page_content
    assert 'In recent years, data has been expanding' in first_chunk_text
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | cover/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | .pybuilder/ 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | # For a library or package, you might want to ignore these files since the code is 89 | # intended to run in multiple environments; otherwise, check them in: 90 | # .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # poetry 100 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 101 | # This is especially recommended for binary packages to ensure reproducibility, and is more 102 | # commonly ignored for libraries. 103 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 104 | #poetry.lock 105 | 106 | # pdm 107 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
108 | #pdm.lock 109 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 110 | # in version control. 111 | # https://pdm.fming.dev/#use-with-ide 112 | .pdm.toml 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
class VortexPdfParser:
    """A parser for extracting and cleaning text from PDF documents.

    Call ``set_pdf_file_path`` before any other method; the extraction
    methods all read ``self.pdf_file_path``.
    """

    def set_pdf_file_path(self, pdf_file_path: str):
        """Set the path to the PDF file.

        Args:
            pdf_file_path: Path of the PDF that subsequent calls will parse.

        Raises:
            FileNotFoundError: If *pdf_file_path* is not an existing file.
        """
        if not os.path.isfile(pdf_file_path):
            raise FileNotFoundError(f"File not found: {pdf_file_path}")
        self.pdf_file_path = pdf_file_path

    def clean_text_to_docs(self) -> List[docstore.Document]:
        """Parse the PDF, clean every page and return the text as Document chunks."""
        raw_pages, metadata = self.parse_pdf()

        # Order matters: hyphenated line breaks are merged before single
        # newlines become spaces, and runs of newlines are collapsed last.
        cleaning_functions: List[Callable[[str], str]] = [
            self.merge_hyphenated_words,
            self.fix_newlines,
            self.remove_multiple_newlines,
        ]

        cleaned_text_pdf = self.clean_text(raw_pages, cleaning_functions)
        return self.text_to_docs(cleaned_text_pdf, metadata)

    def parse_pdf(self) -> Tuple[List[Tuple[int, str]], Dict[str, str]]:
        """Extract and return the pages and metadata from the PDF.

        Returns:
            A ``(pages, metadata)`` tuple, where *pages* is a list of
            ``(page_number, text)`` pairs and *metadata* is the dict built by
            ``extract_metadata_from_pdf``.
        """
        metadata = self.extract_metadata_from_pdf()
        pages = self.extract_pages_from_pdf()
        return pages, metadata

    def extract_metadata_from_pdf(self) -> Dict[str, str]:
        """Extract and return the metadata from the PDF.

        Returns:
            A dict with the keys ``title``, ``author`` and ``creation_date``.
            Missing title/author become ``""``; a missing creation date
            becomes ``"1900-01-01"``.
        """
        logger.info("Extracting metadata")
        with open(self.pdf_file_path, "rb") as pdf_file:
            reader = PdfReader(pdf_file)
            metadata = reader.metadata
            logger.info(f"{getattr(metadata, 'title', 'no title')}")
            # Placeholder date used when the PDF carries no creation date.
            default_date = date(1900, 1, 1)
            return {
                "title": getattr_or_default(metadata, 'title', '').strip(),
                "author": getattr_or_default(metadata, 'author', '').strip(),
                "creation_date": getattr_or_default(metadata,
                                                    'creation_date',
                                                    default_date).strftime('%Y-%m-%d'),
            }

    def extract_pages_from_pdf(self) -> List[Tuple[int, str]]:
        """Extract and return the text of each page from the PDF.

        Returns:
            ``(page_number, text)`` pairs with 1-based page numbers; pages
            whose text is empty or whitespace-only are left out.
        """
        logger.info("Extracting pages")
        pages: List[Tuple[int, str]] = []
        with pdfplumber.open(self.pdf_file_path) as pdf:
            for page_number, page in enumerate(pdf.pages, start=1):
                # Extract once per page and reuse the result for both the
                # emptiness check and the output; `or ""` keeps a None result
                # from breaking .strip().
                page_text = page.extract_text() or ""
                if page_text.strip():
                    pages.append((page_number, page_text))
        return pages

    def clean_text(self,
                   pages: List[Tuple[int, str]],
                   cleaning_functions: List[Callable[[str], str]]
                   ) -> List[Tuple[int, str]]:
        """Apply the cleaning functions to the text of each page.

        Args:
            pages: ``(page_number, text)`` pairs to clean.
            cleaning_functions: Callables applied to each page's text in the
                given order.

        Returns:
            A new list of ``(page_number, cleaned_text)`` pairs.
        """
        logger.info("Cleaning text of each page")
        cleaned_pages = []
        for page_num, text in pages:
            for cleaning_function in cleaning_functions:
                text = cleaning_function(text)
            cleaned_pages.append((page_num, text))
        return cleaned_pages

    def merge_hyphenated_words(self, text: str) -> str:
        """Merge words in the text that have been split with a hyphen."""
        return re.sub(r"(\w)-\n(\w)", r"\1\2", text)

    def fix_newlines(self, text: str) -> str:
        """Replace single newline characters in the text with spaces."""
        # A newline counts as "single" when no other newline touches it.
        return re.sub(r"(?<!\n)\n(?!\n)", " ", text)

    def remove_multiple_newlines(self, text: str) -> str:
        """Reduce multiple newline characters in the text to a single newline."""
        return re.sub(r"\n{2,}", "\n", text)

    def text_to_docs(self, text: List[Tuple[int, str]],
                     metadata: Dict[str, str]) -> List[docstore.Document]:
        """Split the text into chunks and return them as Documents.

        Args:
            text: ``(page_number, page_text)`` pairs to split.
            metadata: Document-level metadata merged into every chunk's
                metadata.

        Returns:
            One Document per chunk, whose metadata holds ``page_number``,
            ``chunk`` (index within the page) and ``source``
            (``p<page>-<chunk>``) plus the entries of *metadata*.
        """
        doc_chunks: List[docstore.Document] = []

        # The splitter configuration is the same for every page, so build it once.
        text_splitter = splitter.RecursiveCharacterTextSplitter(
            chunk_size=1000,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
            chunk_overlap=200,
        )

        for page_num, page in text:
            logger.info(f"Splitting page {page_num}")
            chunks = text_splitter.split_text(page)
            for i, chunk in enumerate(chunks):
                doc = docstore.Document(
                    page_content=chunk,
                    metadata={
                        "page_number": page_num,
                        "chunk": i,
                        "source": f"p{page_num}-{i}",
                        # Spread last, so same-named document keys win.
                        **metadata,
                    },
                )
                doc_chunks.append(doc)
        return doc_chunks