├── tests
    ├── __init__.py
    ├── test_app.py
    ├── test_docs
    │   ├── w28196.pdf
    │   ├── mirkovic_benzel_teachingcybersecurity.pdf
    │   ├── Cyber_Security_Threats_in_Cloud_Literature_Review.pdf
    │   └── Ten_Deadly_Cyber_Security_Threats_Amid_COVID-19_Pa.pdf
    └── test_vortex_pfd_parser.py
├── app
    ├── ingest
    │   ├── __init__.py
    │   ├── utils.py
    │   ├── vortext_content_iterator.py
    │   ├── vortex_ingester.py
    │   └── vortex_pdf_parser.py
    ├── query
    │   ├── __init__.py
    │   └── vortex_query.py
    ├── __init__.py
    ├── logo.png
    ├── settings.py
    ├── data
    │   └── chroma
    │   │   ├── chroma-collections.parquet
    │   │   ├── chroma-embeddings.parquet
    │   │   └── index
    │   │       ├── index_742cdaf5-f415-4d2a-a87a-daada12ab26d.bin
    │   │       ├── id_to_uuid_742cdaf5-f415-4d2a-a87a-daada12ab26d.pkl
    │   │       ├── uuid_to_id_742cdaf5-f415-4d2a-a87a-daada12ab26d.pkl
    │   │       └── index_metadata_742cdaf5-f415-4d2a-a87a-daada12ab26d.pkl
    ├── .streamlit
    │   └── config.toml
    ├── ingest.py
    ├── query.py
    └── streamlit_app.py
├── .flake8
├── vortex.png
├── docs
    └── w28196.pdf
├── .dockerignore
├── .github
    └── workflows
    │   └── main.yaml
├── LICENSE
├── pyproject.toml
├── README.md
└── .gitignore


/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/app/ingest/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/app/query/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/app/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.1.0"
2 | 


--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 90
3 | count = true


--------------------------------------------------------------------------------
/vortex.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PatrickKalkman/python-docuvortex/HEAD/vortex.png


--------------------------------------------------------------------------------
/app/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PatrickKalkman/python-docuvortex/HEAD/app/logo.png


--------------------------------------------------------------------------------
/app/settings.py:
--------------------------------------------------------------------------------
1 | COLLECTION_NAME = "neonshield-2023-05"
2 | PERSIST_DIRECTORY = "./data/chroma"
3 | 


--------------------------------------------------------------------------------
/docs/w28196.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PatrickKalkman/python-docuvortex/HEAD/docs/w28196.pdf


--------------------------------------------------------------------------------
/tests/test_app.py:
--------------------------------------------------------------------------------
1 | from app import __version__
2 | 
3 | 
def test_version():
    """The package exposes the version string this test suite expects."""
    expected_version = "0.1.0"
    assert __version__ == expected_version
6 | 


--------------------------------------------------------------------------------
/tests/test_docs/w28196.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PatrickKalkman/python-docuvortex/HEAD/tests/test_docs/w28196.pdf


--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
 1 | __pycache__
 2 | *.pyc
 3 | *.pyo
 4 | *.pyd
 5 | *.egg-info
 6 | *.github
 7 | *.git
 8 | *.test
 9 | dist
10 | build


--------------------------------------------------------------------------------
/app/data/chroma/chroma-collections.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PatrickKalkman/python-docuvortex/HEAD/app/data/chroma/chroma-collections.parquet


--------------------------------------------------------------------------------
/app/data/chroma/chroma-embeddings.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PatrickKalkman/python-docuvortex/HEAD/app/data/chroma/chroma-embeddings.parquet


--------------------------------------------------------------------------------
/app/.streamlit/config.toml:
--------------------------------------------------------------------------------
1 | [theme]
2 | primaryColor="#60b4ff"
3 | backgroundColor="#FFFFFF"
4 | secondaryBackgroundColor="#F0F2F6"
5 | textColor="#262730"
6 | font="sans serif"


--------------------------------------------------------------------------------
/tests/test_docs/mirkovic_benzel_teachingcybersecurity.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PatrickKalkman/python-docuvortex/HEAD/tests/test_docs/mirkovic_benzel_teachingcybersecurity.pdf


--------------------------------------------------------------------------------
/app/data/chroma/index/index_742cdaf5-f415-4d2a-a87a-daada12ab26d.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PatrickKalkman/python-docuvortex/HEAD/app/data/chroma/index/index_742cdaf5-f415-4d2a-a87a-daada12ab26d.bin


--------------------------------------------------------------------------------
/tests/test_docs/Cyber_Security_Threats_in_Cloud_Literature_Review.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PatrickKalkman/python-docuvortex/HEAD/tests/test_docs/Cyber_Security_Threats_in_Cloud_Literature_Review.pdf


--------------------------------------------------------------------------------
/tests/test_docs/Ten_Deadly_Cyber_Security_Threats_Amid_COVID-19_Pa.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PatrickKalkman/python-docuvortex/HEAD/tests/test_docs/Ten_Deadly_Cyber_Security_Threats_Amid_COVID-19_Pa.pdf


--------------------------------------------------------------------------------
/app/data/chroma/index/id_to_uuid_742cdaf5-f415-4d2a-a87a-daada12ab26d.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PatrickKalkman/python-docuvortex/HEAD/app/data/chroma/index/id_to_uuid_742cdaf5-f415-4d2a-a87a-daada12ab26d.pkl


--------------------------------------------------------------------------------
/app/data/chroma/index/uuid_to_id_742cdaf5-f415-4d2a-a87a-daada12ab26d.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PatrickKalkman/python-docuvortex/HEAD/app/data/chroma/index/uuid_to_id_742cdaf5-f415-4d2a-a87a-daada12ab26d.pkl


--------------------------------------------------------------------------------
/app/data/chroma/index/index_metadata_742cdaf5-f415-4d2a-a87a-daada12ab26d.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PatrickKalkman/python-docuvortex/HEAD/app/data/chroma/index/index_metadata_742cdaf5-f415-4d2a-a87a-daada12ab26d.pkl


--------------------------------------------------------------------------------
/app/ingest.py:
--------------------------------------------------------------------------------
 1 | from dotenv import load_dotenv
 2 | 
 3 | from ingest.vortex_ingester import VortexIngester
 4 | 
 5 | load_dotenv()
 6 | 
 7 | 
 8 | def main():
 9 |     ingester = VortexIngester("../docs/")
10 |     ingester.ingest()
11 | 
12 | 
13 | if __name__ == "__main__":
14 |     main()
15 | 


--------------------------------------------------------------------------------
/app/ingest/utils.py:
--------------------------------------------------------------------------------
def getattr_or_default(obj, attr, default=None):
    """Return ``obj.attr``, falling back to ``default``.

    The fallback is used both when the attribute is not found and when it
    exists but its value is None.

    :param obj: Object to read the attribute from.
    :param attr: Name of the attribute to look up.
    :param default: Value returned when the attribute is missing or None.
    :return: The attribute value, or ``default``.
    """
    value = getattr(obj, attr, default)
    return default if value is None else value
6 | 


--------------------------------------------------------------------------------
/app/ingest/vortext_content_iterator.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | 
 4 | class VortexContentIterator():
 5 |     def __init__(self, content_path):
 6 |         self.content_path = content_path
 7 | 
 8 |     def __iter__(self):
 9 |         for file in os.listdir(self.content_path):
10 |             if file.endswith(".pdf"):
11 |                 yield os.path.join(self.content_path, file)
12 | 


--------------------------------------------------------------------------------
/app/query.py:
--------------------------------------------------------------------------------
 1 | from query.vortex_query import VortexQuery
 2 | 
 3 | 
 4 | def main():
 5 |     vortex_query = VortexQuery()
 6 | 
 7 |     while True:
 8 |         print()
 9 |         question = input("Question: ")
10 | 
11 |         answer, source = vortex_query.ask_question(question)
12 | 
13 |         print("\n\nSources:\n")
14 |         for document in source:
15 |             print(f"Page: {document.metadata['page_number']}")
16 |             print(f"Text chunk: {document.page_content[:160]}...\n")
17 |         print(f"Answer: {answer}")
18 | 
19 | 
20 | if __name__ == "__main__":
21 |     main()
22 | 


--------------------------------------------------------------------------------
/.github/workflows/main.yaml:
--------------------------------------------------------------------------------
 1 | name: Python CI
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - master
 7 |   pull_request:
 8 |     branches:
 9 |       - master
10 | 
11 | jobs:
12 |   build-and-deploy:
13 |     runs-on: ubuntu-latest
14 | 
15 |     steps:
16 |     - name: Check out code
17 |       uses: actions/checkout@v3
18 | 
19 |     - name: Set up Python
20 |       uses: actions/setup-python@v3
21 |       with:
22 |         python-version: 3.11.3
23 | 
24 |     - name: Install poetry
25 |       run: |
26 |         python -m pip install poetry==1.4.2
27 | 
28 |     - name: Configure poetry
29 |       run: |
30 |         python -m poetry config virtualenvs.in-project true
31 | 
32 |     - name: Cache the virtualenv
33 |       uses: actions/cache@v3
34 |       with:
35 |         path: ./.venv
36 |         key: ${{ runner.os }}-venv-${{ hashFiles('**/poetry.lock') }}
37 | 
38 |     - name: Install dependencies
39 |       run: |
40 |         python -m poetry install
41 | 
42 |     - name: Run test
43 |       run: |
44 |         poetry run pytest
45 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Patrick Kalkman
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "python-docuvortex"
 3 | version = "0.1.0"
 4 | description = ""
 5 | authors = ["Patrick Kalkman <patrick@simpletechture.nl>"]
 6 | license = "MIT"
 7 | readme = "README.md"
 8 | packages = [{include = "app"}]
 9 | 
10 | [tool.poetry.dependencies]
11 | python = "^3.11"
12 | openapi = "^1.1.0"
13 | langchain = "^0.0.177"
14 | python-dotenv = "^1.0.0"
15 | openai = "^0.27.7"
16 | tiktoken = "^0.4.0"
17 | chromadb = "^0.3.25"
18 | pdfplumber = "^0.9.0"
19 | loguru = "^0.7.0"
20 | pypdf = "^3.9.0"
21 | streamlit = "^1.22.0"
22 | streamlit-chat = "^0.0.2.2"
23 | streamlit-extras = "^0.2.7"
24 | pillow = "^9.5.0"
25 | 
26 | [tool.poetry.group.dev.dependencies]
27 | mypy = "^1.3.0"
28 | bandit = "^1.7.5"
29 | pytest = "^7.3.1"
30 | 
31 | [tool.bandit]
32 | exclude_dirs = ["tests",]
33 | 
34 | [tool.pytest.ini_options]
35 | filterwarnings = ["ignore::Warning"]
36 | log_cli = true
37 | log_cli_level = "INFO"
38 | log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)"
39 | log_cli_date_format = "%Y-%m-%d %H:%M:%S"
40 | 
41 | [build-system]
42 | requires = ["poetry-core"]
43 | build-backend = "poetry.core.masonry.api"
44 | 


--------------------------------------------------------------------------------
/app/ingest/vortex_ingester.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | from langchain.embeddings import OpenAIEmbeddings
 3 | from langchain.vectorstores import Chroma
 4 | import langchain.docstore.document as docstore
 5 | from loguru import logger
 6 | 
 7 | from settings import COLLECTION_NAME, PERSIST_DIRECTORY
 8 | 
 9 | from .vortex_pdf_parser import VortexPdfParser
10 | from .vortext_content_iterator import VortexContentIterator
11 | 
12 | 
class VortexIngester:
    """Ingest the PDF files of a folder into a Chroma vector store.

    Every PDF is parsed and split into chunks by ``VortexPdfParser``; all
    chunks are then embedded with ``OpenAIEmbeddings`` and persisted in the
    collection ``COLLECTION_NAME`` under ``PERSIST_DIRECTORY``.
    """

    def __init__(self, content_folder: str):
        """
        :param content_folder: Folder that is scanned for PDF files.
        """
        self.content_folder = content_folder

    def ingest(self) -> None:
        """Parse, embed and persist every PDF found in ``content_folder``."""
        vortex_content_iterator = VortexContentIterator(self.content_folder)
        vortex_pdf_parser = VortexPdfParser()

        chunks: List[docstore.Document] = []
        for document in vortex_content_iterator:
            vortex_pdf_parser.set_pdf_file_path(document)
            document_chunks = vortex_pdf_parser.clean_text_to_docs()
            chunks.extend(document_chunks)
            # Report this document's own count, not the running total.
            logger.info(f"Extracted {len(document_chunks)} chunks from {document}")

        if not chunks:
            # Nothing to embed: skip the embedding and store step entirely.
            logger.warning(f"No chunks extracted from {self.content_folder}")
            return

        logger.info(f"Extracted {len(chunks)} chunks in total")

        embeddings = OpenAIEmbeddings(client=None)
        logger.info("Loaded embeddings")
        vector_store = Chroma.from_documents(
            chunks,
            embeddings,
            collection_name=COLLECTION_NAME,
            persist_directory=PERSIST_DIRECTORY,
        )

        logger.info("Created Chroma vector store")
        vector_store.persist()
        logger.info("Persisted Chroma vector store")
41 | 


--------------------------------------------------------------------------------
/app/query/vortex_query.py:
--------------------------------------------------------------------------------
 1 | from dotenv import load_dotenv
 2 | from langchain.chains import ConversationalRetrievalChain
 3 | from langchain.chat_models import ChatOpenAI
 4 | from langchain.embeddings.openai import OpenAIEmbeddings
 5 | from langchain.schema import AIMessage, HumanMessage
 6 | from langchain.vectorstores.chroma import Chroma
 7 | 
 8 | from settings import COLLECTION_NAME, PERSIST_DIRECTORY
 9 | 
10 | 
class VortexQuery:
    """Conversational retrieval over the Chroma collection named in ``settings``."""

    def __init__(self):
        """Load the environment, build the chain and start with an empty history."""
        load_dotenv()
        self.chain = self.make_chain()
        self.chat_history = []

    def make_chain(self):
        """Build a ``ConversationalRetrievalChain`` that also returns its sources.

        :return: Chain backed by a ``gpt-3.5-turbo`` chat model (temperature 0)
            and a retriever over the Chroma store in ``PERSIST_DIRECTORY``.
        """
        llm = ChatOpenAI(client=None, model="gpt-3.5-turbo", temperature=0)
        embedding_function = OpenAIEmbeddings(client=None)
        store = Chroma(
            collection_name=COLLECTION_NAME,
            embedding_function=embedding_function,
            persist_directory=PERSIST_DIRECTORY,
        )
        retriever = store.as_retriever()

        return ConversationalRetrievalChain.from_llm(
            llm,
            retriever=retriever,
            return_source_documents=True,
        )

    def ask_question(self, question: str):
        """Ask the chain a question and record the exchange in the chat history.

        :param question: The question to put to the chain.
        :return: Tuple of (answer, source documents) taken from the chain response.
        """
        inputs = {"question": question, "chat_history": self.chat_history}
        result = self.chain(inputs)

        answer, source = result["answer"], result["source_documents"]

        # Both sides of the exchange are stored so the next call sees them.
        self.chat_history.append(HumanMessage(content=question))
        self.chat_history.append(AIMessage(content=answer))

        return answer, source
46 | 


--------------------------------------------------------------------------------
/app/streamlit_app.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | from streamlit_chat import message
 3 | from PIL import Image
 4 | from query.vortex_query import VortexQuery
 5 | 
 6 | 
 7 | def initialize_page():
 8 |     st.set_page_config(page_title='DocuVortex', page_icon=':books:')
 9 |     st.image(logo_image, width=80)
10 |     st.header("NeonShield DocuVortex")
11 |     st.markdown("[Github](https://github.com/pkalkman/python-docuvortex)")
12 | 
13 | 
def handle_query_form():
    """
    Render the query form with a text input and a submit button
    :return: Tuple of (value of the text input, value of the submit button)
    """
    with st.form(key='query_form'):
        text = st.text_input(
            'Search for: ',
            '',
            key='input',
            help='Enter your search query?',
        )
        submitted = st.form_submit_button('Submit')
    return text, submitted
20 | 
21 | 
def display_chat_history():
    """Render the stored question/answer pairs in reverse of their stored order."""
    questions = st.session_state['past']
    answers = st.session_state['generated']

    # Walk both lists from the end so the last stored pair is shown first.
    reversed_pairs = zip(reversed(questions), reversed(answers))
    for i, (user_msg, ai_msg) in enumerate(reversed_pairs):
        message(user_msg, is_user=True, key=f"user_{i}")
        message(ai_msg, key=f"ai_{i}")
27 | 
28 | 
def query(question: str) -> str:
    """
    Query the VortexQuery model with the provided question

    The VortexQuery instance is created once and kept in ``st.session_state``,
    so the chat history it accumulates carries over to follow-up questions and
    the chain is not rebuilt on every submit.
    :param question: The question to ask the model
    :return: The answer from the model
    """
    if 'vortex_query' not in st.session_state:
        st.session_state['vortex_query'] = VortexQuery()
    answer, _ = st.session_state['vortex_query'].ask_question(question)
    return answer
38 | 
39 | 
# initialize_page() reads this module-level name, so load it first.
logo_image = Image.open('./logo.png')

# Initialize page and session state
for _history_key in ('generated', 'past'):
    st.session_state.setdefault(_history_key, [])

initialize_page()
user_query, submit_button = handle_query_form()

if user_query and submit_button:
    # Ask first; the exchange is only recorded once an answer came back.
    model_response = query(user_query)
    st.session_state['past'].append(user_query)
    st.session_state['generated'].append(model_response)

display_chat_history()
55 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Harnessing the Vortex: Building a Document-Based Q&A System Using OpenAI and Python
 2 | 
 3 | ## Leveraging the Power of Large Language Models and the Langchain Framework for an Innovative Approach to Document Querying
 4 | 
 5 | ![DocuVortex](/vortex.png "DocuVortex")
 6 | 
 7 | 
 8 | This project aims to implement a document-based question-answering system using the power of OpenAI's GPT-3.5 Turbo model, Python, and the Langchain Framework. It processes PDF documents, breaking them into ingestible chunks, and then stores these chunks in a Chroma DB vector database for querying. It complements a Medium article called [How to Build a Document-Based Q&A System Using OpenAI and Python](https://medium.com/itnext/how-to-build-a-document-based-q-a-system-using-openai-and-python-17d1c3cc2081).
 9 | 
10 | ## Getting Started
11 | 
12 | These instructions will get you a copy of the project up and running on your local machine for development and testing purposes.
13 | 
14 | ### Prerequisites
15 | 
16 | To install the project, you need to have [Python](https://www.python.org/downloads/) installed on your machine.
17 | 
18 | ### Installing
19 | 
20 | The project uses [Poetry](https://python-poetry.org/) for managing dependencies. After cloning the repository, navigate to the project directory and install dependencies with the following commands:
21 | 
22 | ```bash
23 | poetry install
24 | poetry shell
25 | ```
26 | 
27 | ## Running the Application
28 | Before you can ingest or query documents, you have to make sure that a .env file exists. This file should have a single line that reads ```OPENAI_API_KEY=yourkey```
29 | 
30 | ### Ingesting Documents
31 | To ingest documents, place your PDF files in the 'docs' folder, make sure that you are in the app folder, and run the following command:
32 | 
33 | ```bash
34 | cd app
35 | python ingest.py
36 | ```
37 | 
38 | ### Querying Documents
39 | To query the ingested documents, make sure that you are in the app folder, run the following command and follow the interactive prompts:
40 | 
41 | ```bash
42 | cd app
43 | python query.py
44 | ```
45 | 
46 | ### Running the Streamlit App
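47 | To visualize and interact with the system via the Streamlit app, make sure that you are in the app folder (the app loads its logo and vector store through relative paths) and run the following command: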
48 | 
49 | ```bash
50 | streamlit run streamlit_app.py
51 | ```
52 | 
53 | ### Authors
54 | [Patrick Kalkman](https://github.com/PatrickKalkman)
55 | 
56 | ### License
57 | This project is licensed under the MIT license - see the LICENSE file for details
58 | 
59 | ### Acknowledgments
60 | - [Langchain Framework](https://python.langchain.com/en/latest/index.html)
61 | - [OpenAI](https://openai.com/)
62 | - [Chroma DB](https://www.trychroma.com/)
63 | - [Streamlit](https://streamlit.io/)
64 | 
65 | 
66 | 


--------------------------------------------------------------------------------
/tests/test_vortex_pfd_parser.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from app.ingest.vortex_pdf_parser import VortexPdfParser
 4 | 
 5 | 
 6 | @pytest.fixture
 7 | def pdf_file():
 8 |     return "./tests/test_docs/Cyber_Security_Threats_in_Cloud_Literature_Review.pdf"
 9 | 
10 | 
# One row per PDF: (path, expected title, author fragment, creation-date fragment).
# An empty fragment makes the matching substring check pass for any value.
pdf_test_cases = [
    (
        "./tests/test_docs/Cyber_Security_Threats_in_Cloud_Literature_Review.pdf",
        "Cyber Security Threats in Cloud: Literature Review",
        "Almaiah",
        "2021-06-15",
    ),
    (
        "./tests/test_docs/mirkovic_benzel_teachingcybersecurity.pdf",
        "",
        "",
        "",
    ),
    (
        "./tests/test_docs/w28196.pdf",
        "Cybersecurity Risk",
        "",
        "",
    ),
]


@pytest.mark.parametrize("pdf_file,title,author,creation_date", pdf_test_cases)
def test_extract_metadata_from_pdf(pdf_file, title, author, creation_date):
    """The title must match exactly; author and date are matched as substrings."""
    parser = VortexPdfParser()
    parser.set_pdf_file_path(pdf_file)
    extracted = parser.extract_metadata_from_pdf()
    assert extracted['title'] == title
    assert author in extracted['author']
    assert creation_date in extracted['creation_date']
29 | 
30 | 
def test_extract_pages_from_pdf(pdf_file):
    """The sample PDF yields eight pages and the first one keeps its raw newlines."""
    parser = VortexPdfParser()
    parser.set_pdf_file_path(pdf_file)
    extracted_pages = parser.extract_pages_from_pdf()
    assert len(extracted_pages) == 8
    first_page_text = extracted_pages[0][1]
    assert 'collection of most\n352 studies are remained' in first_page_text
37 | 
38 | 
def test_parse_pdf(pdf_file):
    """``parse_pdf`` returns both the raw pages and the metadata of the sample PDF."""
    parser = VortexPdfParser()
    parser.set_pdf_file_path(pdf_file)
    extracted_pages, extracted_metadata = parser.parse_pdf()

    expected_title = "Cyber Security Threats in Cloud: Literature Review"
    assert extracted_metadata['title'] == expected_title
    assert 'Almaiah' in extracted_metadata['author']
    assert '2021-06-15' in extracted_metadata['creation_date']

    assert len(extracted_pages) == 8
    first_page_text = extracted_pages[0][1]
    assert 'collection of most\n352 studies are remained' in first_page_text
48 | 
49 | 
def test_clean_text(pdf_file):
    """Cleaning keeps all eight pages and turns single newlines into spaces."""
    parser = VortexPdfParser()
    parser.set_pdf_file_path(pdf_file)
    raw_pages = parser.extract_pages_from_pdf()
    cleaners = [
        parser.merge_hyphenated_words,
        parser.fix_newlines,
        parser.remove_multiple_newlines,
    ]
    cleaned_pages = parser.clean_text(raw_pages, cleaners)
    assert len(cleaned_pages) == 8
    first_page_text = cleaned_pages[0][1]
    assert 'collection of most 352 studies are remained' in first_page_text
59 | 
60 | 
def test_text_to_docs(pdf_file):
    """The first Document built from the raw pages contains the paper's opening text."""
    parser = VortexPdfParser()
    parser.set_pdf_file_path(pdf_file)
    raw_pages = parser.extract_pages_from_pdf()
    pdf_metadata = parser.extract_metadata_from_pdf()
    documents = parser.text_to_docs(raw_pages, pdf_metadata)
    first_chunk = documents[0].page_content
    assert 'In recent years, data has been expanding' in first_chunk
68 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | **/.DS_Store
  7 | 
  8 | # C extensions
  9 | *.so
 10 | 
 11 | # Distribution / packaging
 12 | .Python
 13 | build/
 14 | develop-eggs/
 15 | dist/
 16 | downloads/
 17 | eggs/
 18 | .eggs/
 19 | lib/
 20 | lib64/
 21 | parts/
 22 | sdist/
 23 | var/
 24 | wheels/
 25 | share/python-wheels/
 26 | *.egg-info/
 27 | .installed.cfg
 28 | *.egg
 29 | MANIFEST
 30 | 
 31 | # PyInstaller
 32 | #  Usually these files are written by a python script from a template
 33 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 34 | *.manifest
 35 | *.spec
 36 | 
 37 | # Installer logs
 38 | pip-log.txt
 39 | pip-delete-this-directory.txt
 40 | 
 41 | # Unit test / coverage reports
 42 | htmlcov/
 43 | .tox/
 44 | .nox/
 45 | .coverage
 46 | .coverage.*
 47 | .cache
 48 | nosetests.xml
 49 | coverage.xml
 50 | *.cover
 51 | *.py,cover
 52 | .hypothesis/
 53 | .pytest_cache/
 54 | cover/
 55 | 
 56 | # Translations
 57 | *.mo
 58 | *.pot
 59 | 
 60 | # Django stuff:
 61 | *.log
 62 | local_settings.py
 63 | db.sqlite3
 64 | db.sqlite3-journal
 65 | 
 66 | # Flask stuff:
 67 | instance/
 68 | .webassets-cache
 69 | 
 70 | # Scrapy stuff:
 71 | .scrapy
 72 | 
 73 | # Sphinx documentation
 74 | docs/_build/
 75 | 
 76 | # PyBuilder
 77 | .pybuilder/
 78 | target/
 79 | 
 80 | # Jupyter Notebook
 81 | .ipynb_checkpoints
 82 | 
 83 | # IPython
 84 | profile_default/
 85 | ipython_config.py
 86 | 
 87 | # pyenv
 88 | #   For a library or package, you might want to ignore these files since the code is
 89 | #   intended to run in multiple environments; otherwise, check them in:
 90 | # .python-version
 91 | 
 92 | # pipenv
 93 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 94 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 95 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 96 | #   install all needed dependencies.
 97 | #Pipfile.lock
 98 | 
 99 | # poetry
100 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
101 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
102 | #   commonly ignored for libraries.
103 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
104 | #poetry.lock
105 | 
106 | # pdm
107 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
108 | #pdm.lock
109 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
110 | #   in version control.
111 | #   https://pdm.fming.dev/#use-with-ide
112 | .pdm.toml
113 | 
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 | 
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 | 
121 | # SageMath parsed files
122 | *.sage.py
123 | 
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 | 
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 | 
137 | # Rope project settings
138 | .ropeproject
139 | 
140 | # mkdocs documentation
141 | /site
142 | 
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 | 
148 | # Pyre type checker
149 | .pyre/
150 | 
151 | # pytype static type analyzer
152 | .pytype/
153 | 
154 | # Cython debug symbols
155 | cython_debug/
156 | 
157 | # PyCharm
158 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
161 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 | 


--------------------------------------------------------------------------------
/app/ingest/vortex_pdf_parser.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import re
  3 | from datetime import date
  4 | from typing import Callable, Dict, List, Tuple
  5 | 
  6 | import langchain.docstore.document as docstore
  7 | import langchain.text_splitter as splitter
  8 | import pdfplumber
  9 | from loguru import logger
 10 | from pypdf import PdfReader
 11 | 
 12 | from .utils import getattr_or_default
 13 | 
 14 | 
 15 | class VortexPdfParser:
 16 |     """A parser for extracting and cleaning text from PDF documents."""
 17 | 
 18 |     def set_pdf_file_path(self, pdf_file_path: str):
 19 |         """Set the path to the PDF file."""
 20 |         if not os.path.isfile(pdf_file_path):
 21 |             raise FileNotFoundError(f"File not found: {pdf_file_path}")
 22 |         self.pdf_file_path = pdf_file_path
 23 | 
 24 |     def clean_text_to_docs(self) -> List[docstore.Document]:
 25 |         raw_pages, metadata = self.parse_pdf()
 26 | 
 27 |         cleaning_functions: List = [
 28 |             self.merge_hyphenated_words,
 29 |             self.fix_newlines,
 30 |             self.remove_multiple_newlines,
 31 |         ]
 32 | 
 33 |         cleaned_text_pdf = self.clean_text(raw_pages, cleaning_functions)
 34 |         return self.text_to_docs(cleaned_text_pdf, metadata)
 35 | 
 36 |     def parse_pdf(self) -> Tuple[List[Tuple[int, str]], Dict[str, str]]:
 37 |         """Extract and return the pages and metadata from the PDF."""
 38 |         metadata = self.extract_metadata_from_pdf()
 39 |         pages = self.extract_pages_from_pdf()
 40 |         return pages, metadata
 41 | 
 42 |     def extract_metadata_from_pdf(self) -> Dict[str, str]:
 43 |         """Extract and return the metadata from the PDF."""
 44 |         logger.info("Extracting metadata")
 45 |         with open(self.pdf_file_path, "rb") as pdf_file:
 46 |             reader = PdfReader(pdf_file)
 47 |             metadata = reader.metadata
 48 |             logger.info(f"{getattr(metadata, 'title', 'no title')}")
 49 |             default_date = date(1900, 1, 1)
 50 |             return {
 51 |                 "title": getattr_or_default(metadata, 'title', '').strip(),
 52 |                 "author": getattr_or_default(metadata, 'author', '').strip(),
 53 |                 "creation_date": getattr_or_default(metadata,
 54 |                                                     'creation_date',
 55 |                                                     default_date).strftime('%Y-%m-%d'),
 56 |             }
 57 | 
 58 |     def extract_pages_from_pdf(self) -> List[Tuple[int, str]]:
 59 |         """Extract and return the text of each page from the PDF."""
 60 |         logger.info("Extracting pages")
 61 |         with pdfplumber.open(self.pdf_file_path) as pdf:
 62 |             return [(i + 1, p.extract_text())
 63 |                     for i, p in enumerate(pdf.pages) if p.extract_text().strip()]
 64 | 
 65 |     def clean_text(self,
 66 |                    pages: List[Tuple[int, str]],
 67 |                    cleaning_functions: List[Callable[[str], str]]
 68 |                    ) -> List[Tuple[int, str]]:
 69 |         """Apply the cleaning functions to the text of each page."""
 70 |         logger.info("Cleaning text of each page")
 71 |         cleaned_pages = []
 72 |         for page_num, text in pages:
 73 |             for cleaning_function in cleaning_functions:
 74 |                 text = cleaning_function(text)
 75 |             cleaned_pages.append((page_num, text))
 76 |         return cleaned_pages
 77 | 
 78 |     def merge_hyphenated_words(self, text: str) -> str:
 79 |         """Merge words in the text that have been split with a hyphen."""
 80 |         return re.sub(r"(\w)-\n(\w)", r"\1\2", text)
 81 | 
 82 |     def fix_newlines(self, text: str) -> str:
 83 |         """Replace single newline characters in the text with spaces."""
 84 |         return re.sub(r"(?<!\n)\n(?!\n)", " ", text)
 85 | 
 86 |     def remove_multiple_newlines(self, text: str) -> str:
 87 |         """Reduce multiple newline characters in the text to a single newline."""
 88 |         return re.sub(r"\n{2,}", "\n", text)
 89 | 
 90 |     def text_to_docs(self, text: List[Tuple[int, str]],
 91 |                      metadata: Dict[str, str]) -> List[docstore.Document]:
 92 |         """Split the text into chunks and return them as Documents."""
 93 |         doc_chunks: List[docstore.Document] = []
 94 | 
 95 |         for page_num, page in text:
 96 |             logger.info(f"Splitting page {page_num}")
 97 |             text_splitter = splitter.RecursiveCharacterTextSplitter(
 98 |                 chunk_size=1000,
 99 |                 separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
100 |                 chunk_overlap=200,
101 |             )
102 |             chunks = text_splitter.split_text(page)
103 |             for i, chunk in enumerate(chunks):
104 |                 doc = docstore.Document(
105 |                     page_content=chunk,
106 |                     metadata={
107 |                         "page_number": page_num,
108 |                         "chunk": i,
109 |                         "source": f"p{page_num}-{i}",
110 |                         **metadata,
111 |                     },
112 |                 )
113 |                 doc_chunks.append(doc)
114 |         return doc_chunks
115 | 


--------------------------------------------------------------------------------