├── .env.example
├── .github
│   ├── architecture.png
│   └── ui.png
├── .gitignore
├── .streamlit
│   └── config.toml
├── .vscode
│   └── settings.json
├── LICENSE
├── README.md
├── app.py
├── images
│   ├── assistant-avatar.png
│   └── user-avatar.png
├── notebooks
│   └── reranking-and-filtering.ipynb
├── poetry.lock
├── pyproject.toml
├── ragbase
│   ├── __init__.py
│   ├── chain.py
│   ├── config.py
│   ├── ingestor.py
│   ├── model.py
│   ├── retriever.py
│   ├── session_history.py
│   └── uploader.py
└── requirements.txt
/.env.example:
--------------------------------------------------------------------------------
1 | GROQ_API_KEY="GROQ API KEY HERE"
--------------------------------------------------------------------------------
/.github/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/curiousily/ragbase/5af7b79162329fa0b725b39d1aeeea7eb1e4428b/.github/architecture.png
--------------------------------------------------------------------------------
/.github/ui.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/curiousily/ragbase/5af7b79162329fa0b725b39d1aeeea7eb1e4428b/.github/ui.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build/
113 |
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 |
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 |
121 | # SageMath parsed files
122 | *.sage.py
123 |
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 |
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 |
137 | # Rope project settings
138 | .ropeproject
139 |
140 | # mkdocs documentation
141 | /site
142 |
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 |
148 | # Pyre type checker
149 | .pyre/
150 |
151 | # pytype static type analyzer
152 | .pytype/
153 |
154 | # Cython debug symbols
155 | cython_debug/
156 |
157 | # PyCharm
158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | # and can be added to the global gitignore or merged into this file. For a more nuclear
161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 |
164 | .DS_Store
165 | docs-db
166 | tmp
--------------------------------------------------------------------------------
/.streamlit/config.toml:
--------------------------------------------------------------------------------
1 | [server]
2 | maxUploadSize = 2
3 |
4 | [theme]
5 | base="dark"
6 | primaryColor="#0284c7"
7 | backgroundColor="#111111"
8 | secondaryBackgroundColor="#1e1e1e"
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 |   "[python]": {
3 |     "editor.formatOnSave": true,
4 |     "editor.defaultFormatter": "charliermarsh.ruff",
5 |     "editor.codeActionsOnSave": {
6 |       "source.fixAll": "never",
7 |       "source.organizeImports": "explicit"
8 |     }
9 |   }
10 | }
11 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Venelin Valkov
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # RagBase - Private Chat with Your Documents
2 |
3 | > Completely local RAG with chat UI
4 |
5 |
6 |
7 |
8 |
9 | ## Demo
10 |
11 | Check out the [RagBase demo on Streamlit Cloud](https://ragbase.streamlit.app/). It runs on the Groq API.
12 |
13 | ## Installation
14 |
15 | Clone the repo:
16 |
17 | ```sh
18 | git clone git@github.com:curiousily/ragbase.git
19 | cd ragbase
20 | ```
21 |
22 | Install the dependencies (requires Poetry):
23 |
24 | ```sh
25 | poetry install
26 | ```
27 |
28 | Fetch your LLM (gemma2:9b by default):
29 |
30 | ```sh
31 | ollama pull gemma2:9b
32 | ```
33 |
34 | Run the Ollama server:
35 |
36 | ```sh
37 | ollama serve
38 | ```
39 |
40 | Start RagBase:
41 |
42 | ```sh
43 | poetry run streamlit run app.py
44 | ```
45 |
46 | ## Architecture
47 |
48 |
49 |
50 |
51 |
52 | ### Ingestor
53 |
54 | Extracts text from PDF documents and creates chunks (using semantic and character splitters) that are stored in a vector database.
55 |
56 | ### Retriever
57 |
58 | Given a query, searches for similar documents, reranks the results, and applies an LLM chain filter before returning the response.
59 |
60 | ### QA Chain
61 |
62 | Combines the LLM with the retriever to answer a given user question. See the usage sketch at the end of this README for how these pieces compose.
63 |
64 | ## Tech Stack
65 |
66 | - [Ollama](https://ollama.com/) - run local LLMs
67 | - [Groq API](https://groq.com/) - fast inference for multiple LLMs
68 | - [LangChain](https://www.langchain.com/) - build LLM-powered apps
69 | - [Qdrant](https://qdrant.tech/) - vector search/database
70 | - [FlashRank](https://github.com/PrithivirajDamodaran/FlashRank) - fast reranking
71 | - [FastEmbed](https://qdrant.github.io/fastembed/) - lightweight and fast embedding generation
72 | - [Streamlit](https://streamlit.io/) - build UI for data apps
73 | - [PDFium](https://pdfium.googlesource.com/pdfium/) - PDF processing and text extraction
74 |
75 | ## Add Groq API Key (Optional)
76 |
77 | You can also use the Groq API instead of the local LLM. For that, you'll need a `.env` file with your Groq API key:
78 |
79 | ```sh
80 | GROQ_API_KEY=YOUR API KEY
81 | ```
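82 |
83 | ## Programmatic Usage (Sketch)
84 |
85 | The ingestor, retriever, and QA chain can also be composed outside the Streamlit UI. The following is a minimal sketch built from the functions in the `ragbase` package; the PDF path, question, and session id are illustrative placeholders, and the defaults from `ragbase/config.py` are assumed:
86 |
87 | ```py
88 | # Minimal sketch: wire the RAG pieces together without the UI.
89 | # The PDF path and question below are placeholders, not files/values from this repo.
90 | import asyncio
91 | from pathlib import Path
92 |
93 | from dotenv import load_dotenv
94 |
95 | from ragbase.chain import ask_question, create_chain
96 | from ragbase.ingestor import Ingestor
97 | from ragbase.model import create_llm
98 | from ragbase.retriever import create_retriever
99 |
100 | load_dotenv()  # loads GROQ_API_KEY when the remote LLM is used (the default)
101 |
102 |
103 | async def main():
104 |     # Chunk and embed the PDF(s), storing them in the local Qdrant collection
105 |     vector_store = Ingestor().ingest([Path("docs/example.pdf")])
106 |     llm = create_llm()
107 |     retriever = create_retriever(llm, vector_store=vector_store)
108 |     chain = create_chain(llm, retriever)
109 |     # ask_question streams events: strings are answer tokens, lists hold source documents
110 |     async for event in ask_question(chain, "What is this document about?", session_id="demo"):
111 |         if isinstance(event, str):
112 |             print(event, end="")
113 |
114 |
115 | asyncio.run(main())
116 | ```
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import random
3 |
4 | import streamlit as st
5 | from dotenv import load_dotenv
6 |
7 | from ragbase.chain import ask_question, create_chain
8 | from ragbase.config import Config
9 | from ragbase.ingestor import Ingestor
10 | from ragbase.model import create_llm
11 | from ragbase.retriever import create_retriever
12 | from ragbase.uploader import upload_files
13 |
14 | load_dotenv()
15 |
16 | LOADING_MESSAGES = [
17 |     "Calculating your answer through multiverse...",
18 |     "Adjusting quantum entanglement...",
19 |     "Summoning star wisdom... 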
almost there!", 20 | "Consulting Schrödinger's cat...", 21 | "Warping spacetime for your response...", 22 | "Balancing neutron star equations...", 23 | "Analyzing dark matter... please wait...", 24 | "Engaging hyperdrive... en route!", 25 | "Gathering photons from a galaxy...", 26 | "Beaming data from Andromeda... stand by!", 27 | ] 28 | 29 | 30 | @st.cache_resource(show_spinner=False) 31 | def build_qa_chain(files): 32 | file_paths = upload_files(files) 33 | vector_store = Ingestor().ingest(file_paths) 34 | llm = create_llm() 35 | retriever = create_retriever(llm, vector_store=vector_store) 36 | return create_chain(llm, retriever) 37 | 38 | 39 | async def ask_chain(question: str, chain): 40 | full_response = "" 41 | assistant = st.chat_message( 42 | "assistant", avatar=str(Config.Path.IMAGES_DIR / "assistant-avatar.png") 43 | ) 44 | with assistant: 45 | message_placeholder = st.empty() 46 | message_placeholder.status(random.choice(LOADING_MESSAGES), state="running") 47 | documents = [] 48 | async for event in ask_question(chain, question, session_id="session-id-42"): 49 | if type(event) is str: 50 | full_response += event 51 | message_placeholder.markdown(full_response) 52 | if type(event) is list: 53 | documents.extend(event) 54 | for i, doc in enumerate(documents): 55 | with st.expander(f"Source #{i+1}"): 56 | st.write(doc.page_content) 57 | 58 | st.session_state.messages.append({"role": "assistant", "content": full_response}) 59 | 60 | 61 | def show_upload_documents(): 62 | holder = st.empty() 63 | with holder.container(): 64 | st.header("RagBase") 65 | st.subheader("Get answers from your documents") 66 | uploaded_files = st.file_uploader( 67 | label="Upload PDF files", type=["pdf"], accept_multiple_files=True 68 | ) 69 | if not uploaded_files: 70 | st.warning("Please upload PDF documents to continue!") 71 | st.stop() 72 | 73 | with st.spinner("Analyzing your document(s)..."): 74 | holder.empty() 75 | return build_qa_chain(uploaded_files) 76 | 77 | 78 | def show_message_history(): 79 | for message in st.session_state.messages: 80 | role = message["role"] 81 | avatar_path = ( 82 | Config.Path.IMAGES_DIR / "assistant-avatar.png" 83 | if role == "assistant" 84 | else Config.Path.IMAGES_DIR / "user-avatar.png" 85 | ) 86 | with st.chat_message(role, avatar=str(avatar_path)): 87 | st.markdown(message["content"]) 88 | 89 | 90 | def show_chat_input(chain): 91 | if prompt := st.chat_input("Ask your question here"): 92 | st.session_state.messages.append({"role": "user", "content": prompt}) 93 | with st.chat_message( 94 | "user", 95 | avatar=str(Config.Path.IMAGES_DIR / "user-avatar.png"), 96 | ): 97 | st.markdown(prompt) 98 | asyncio.run(ask_chain(prompt, chain)) 99 | 100 | 101 | st.set_page_config(page_title="RagBase", page_icon="🐧") 102 | 103 | st.html( 104 | """ 105 | 111 | """ 112 | ) 113 | 114 | if "messages" not in st.session_state: 115 | st.session_state.messages = [ 116 | { 117 | "role": "assistant", 118 | "content": "Hi! What do you want to know about your documents?", 119 | } 120 | ] 121 | 122 | if Config.CONVERSATION_MESSAGES_LIMIT > 0 and Config.CONVERSATION_MESSAGES_LIMIT <= len( 123 | st.session_state.messages 124 | ): 125 | st.warning( 126 | "You have reached the conversation limit. Refresh the page to start a new conversation." 
127 | ) 128 | st.stop() 129 | 130 | chain = show_upload_documents() 131 | show_message_history() 132 | show_chat_input(chain) 133 | -------------------------------------------------------------------------------- /images/assistant-avatar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/curiousily/ragbase/5af7b79162329fa0b725b39d1aeeea7eb1e4428b/images/assistant-avatar.png -------------------------------------------------------------------------------- /images/user-avatar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/curiousily/ragbase/5af7b79162329fa0b725b39d1aeeea7eb1e4428b/images/user-avatar.png -------------------------------------------------------------------------------- /notebooks/reranking-and-filtering.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from langchain_core.documents import Document\n", 10 | "from ragbase.model import create_reranker, create_llm\n", 11 | "from langchain.retrievers.document_compressors.chain_filter import LLMChainFilter\n", 12 | "from langchain.globals import set_verbose\n", 13 | "\n", 14 | "set_verbose(True)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "reranker = create_reranker()" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "documents = [\n", 33 | " Document(\"\"\"Not since Porsche blew air cooling out the back door in 1999 with the 996-\n", 34 | "generation 911 has there been a bigger change that will upset more Porsche fans\n", 35 | "than the hybridization of the 911. Fans, however, are not necessarily owners, and\n", 36 | "those with the means will not mind the 21st-century tech one bit, because if\n", 37 | "Porsche didn't tell anyone this updated 992.2 was a hybrid, no one would know by\n", 38 | "how it drives.\n", 39 | "\"\"\"),\n", 40 | " Document(\"\"\"For now, the new 3.6-liter flat-six operating in perfect stoichiometry all the time,\n", 41 | "its electrified turbocharger, and eight-speed dual-clutch gearbox with an integral\n", 42 | "electric motor are limited to the GTS, which used to be powered by a jazzed-up\n", 43 | "version of the twin-turbo 3.0-liter in the base Carrera.\n", 44 | "\"\"\"),\n", 45 | " Document(\"\"\"\n", 46 | "Anyway, the screen is cheaper, but at least Porsche does some cool stuff with it.\n", 47 | "There are seven different views, including a classic five-gauge cluster, but the most\n", 48 | "interesting of them is a track-focused mode that clocks the tach so that the redline\n", 49 | "is near 12 o'clock. Porsche would have scored more points if it had kept the \n", 50 | "orientation of the tach numbers so that they locked with the twist, like a real\n", 51 | "clocked tach out of a 917.\n", 52 | "\"\"\"),\n", 53 | " Document(\"\"\"The bad news is that the GTS coupe starts at $166,895. The back seat is no longer\n", 54 | "standard, though adding the +2 second row is a no-cost option. That's mega\n", 55 | "money, but until Porsche sees sales drop, you better believe it will keep raising its\n", 56 | "prices. 
That's just basic supply-demand, and plenty of people will line up for this\n", 57 | "water-cooled hybrid.\n", 58 | "\"\"\")\n", 59 | "]" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "## Reranker" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 4, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "reranked_documents = reranker.compress_documents(documents, \"What is the price?\")" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 5, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "dict_keys(['id', 'metadata', 'page_content', 'type'])\n", 88 | "score: 0.12178549\n", 89 | "The bad news is that the GTS coupe starts at $166,895. The back seat is no longer\n", 90 | "standard, though adding the +2 second row is a no-cost option. That's mega\n", 91 | "money, but until Porsche sees sales drop, you better believe it will keep raising its\n", 92 | "prices. That's just basic supply-demand, and plenty of people will line up for this\n", 93 | "water-cooled hybrid.\n", 94 | "\n" 95 | ] 96 | } 97 | ], 98 | "source": [ 99 | "first_result = reranked_documents[0]\n", 100 | "print(first_result.__dict__.keys())\n", 101 | "print(\"score:\", first_result.metadata[\"relevance_score\"])\n", 102 | "print(first_result.page_content)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "## LLM Chain Filter" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 6, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | "\n", 122 | "\n", 123 | "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n", 124 | "\n", 125 | "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n", 126 | "Prompt after formatting:\n", 127 | "\u001b[32;1m\u001b[1;3mGiven the following question and context, return YES if the context is relevant to the question and NO if it isn't.\n", 128 | "\n", 129 | "> Question: How many seats in the car?\n", 130 | "> Context:\n", 131 | ">>>\n", 132 | "For now, the new 3.6-liter flat-six operating in perfect stoichiometry all the time,\n", 133 | "its electrified turbocharger, and eight-speed dual-clutch gearbox with an integral\n", 134 | "electric motor are limited to the GTS, which used to be powered by a jazzed-up\n", 135 | "version of the twin-turbo 3.0-liter in the base Carrera.\n", 136 | "\n", 137 | ">>>\n", 138 | "> Relevant (YES / NO):\u001b[0m\n", 139 | "\n", 140 | "Prompt after formatting:\n", 141 | "\u001b[32;1m\u001b[1;3mGiven the following question and context, return YES if the context is relevant to the question and NO if it isn't.\n", 142 | "\n", 143 | "> Question: How many seats in the car?\n", 144 | "> Context:\n", 145 | ">>>\n", 146 | "Not since Porsche blew air cooling out the back door in 1999 with the 996-\n", 147 | "generation 911 has there been a bigger change that will upset more Porsche fans\n", 148 | "than the hybridization of the 911. 
Fans, however, are not necessarily owners, and\n", 149 | "those with the means will not mind the 21st-century tech one bit, because if\n", 150 | "Porsche didn't tell anyone this updated 992.2 was a hybrid, no one would know by\n", 151 | "how it drives.\n", 152 | "\n", 153 | ">>>\n", 154 | "> Relevant (YES / NO):\u001b[0m\n", 155 | "\n", 156 | "\n", 157 | "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n", 158 | "Prompt after formatting:\n", 159 | "\u001b[32;1m\u001b[1;3mGiven the following question and context, return YES if the context is relevant to the question and NO if it isn't.\n", 160 | "\n", 161 | "> Question: How many seats in the car?\n", 162 | "> Context:\n", 163 | ">>>\n", 164 | "\n", 165 | "Anyway, the screen is cheaper, but at least Porsche does some cool stuff with it.\n", 166 | "There are seven different views, including a classic five-gauge cluster, but the most\n", 167 | "interesting of them is a track-focused mode that clocks the tach so that the redline\n", 168 | "is near 12 o'clock. Porsche would have scored more points if it had kept the \n", 169 | "orientation of the tach numbers so that they locked with the twist, like a real\n", 170 | "clocked tach out of a 917.\n", 171 | "\n", 172 | ">>>\n", 173 | "> Relevant (YES / NO):\u001b[0m\n", 174 | "\n", 175 | "\n", 176 | "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n", 177 | "Prompt after formatting:\n", 178 | "\u001b[32;1m\u001b[1;3mGiven the following question and context, return YES if the context is relevant to the question and NO if it isn't.\n", 179 | "\n", 180 | "> Question: How many seats in the car?\n", 181 | "> Context:\n", 182 | ">>>\n", 183 | "The bad news is that the GTS coupe starts at $166,895. The back seat is no longer\n", 184 | "standard, though adding the +2 second row is a no-cost option. That's mega\n", 185 | "money, but until Porsche sees sales drop, you better believe it will keep raising its\n", 186 | "prices. That's just basic supply-demand, and plenty of people will line up for this\n", 187 | "water-cooled hybrid.\n", 188 | "\n", 189 | ">>>\n", 190 | "> Relevant (YES / NO):\u001b[0m\n" 191 | ] 192 | }, 193 | { 194 | "name": "stderr", 195 | "output_type": "stream", 196 | "text": [ 197 | "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", 198 | "To disable this warning, you can either:\n", 199 | "\t- Avoid using `tokenizers` before the fork if possible\n", 200 | "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", 201 | "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", 202 | "To disable this warning, you can either:\n", 203 | "\t- Avoid using `tokenizers` before the fork if possible\n", 204 | "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", 205 | "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", 206 | "To disable this warning, you can either:\n", 207 | "\t- Avoid using `tokenizers` before the fork if possible\n", 208 | "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", 209 | "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", 210 | "To disable this warning, you can either:\n", 211 | "\t- Avoid using `tokenizers` before the fork if possible\n", 212 | "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", 213 | "INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n", 214 | "INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n", 215 | "INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n" 216 | ] 217 | }, 218 | { 219 | "name": "stdout", 220 | "output_type": "stream", 221 | "text": [ 222 | "\n", 223 | "\u001b[1m> Finished chain.\u001b[0m\n", 224 | "\n", 225 | "\u001b[1m> Finished chain.\u001b[0m\n", 226 | "\n", 227 | "\u001b[1m> Finished chain.\u001b[0m\n" 228 | ] 229 | }, 230 | { 231 | "name": "stderr", 232 | "output_type": "stream", 233 | "text": [ 234 | "INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n" 235 | ] 236 | }, 237 | { 238 | "name": "stdout", 239 | "output_type": "stream", 240 | "text": [ 241 | "\n", 242 | "\u001b[1m> Finished chain.\u001b[0m\n" 243 | ] 244 | } 245 | ], 246 | "source": [ 247 | "chain_filter = LLMChainFilter.from_llm(create_llm())\n", 248 | "filtered_documents = chain_filter.compress_documents(documents, \"How many seats in the car?\")" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 7, 254 | "metadata": {}, 255 | "outputs": [ 256 | { 257 | "name": "stdout", 258 | "output_type": "stream", 259 | "text": [ 260 | "1\n", 261 | "The bad news is that the GTS coupe starts at $166,895. The back seat is no longer\n", 262 | "standard, though adding the +2 second row is a no-cost option. That's mega\n", 263 | "money, but until Porsche sees sales drop, you better believe it will keep raising its\n", 264 | "prices. 
That's just basic supply-demand, and plenty of people will line up for this\n", 265 | "water-cooled hybrid.\n", 266 | "\n" 267 | ] 268 | } 269 | ], 270 | "source": [ 271 | "print(len(filtered_documents))\n", 272 | "print(filtered_documents[0].page_content)" 273 | ] 274 | } 275 | ], 276 | "metadata": { 277 | "kernelspec": { 278 | "display_name": "ragbase-YTjELISx-py3.12", 279 | "language": "python", 280 | "name": "python3" 281 | }, 282 | "language_info": { 283 | "codemirror_mode": { 284 | "name": "ipython", 285 | "version": 3 286 | }, 287 | "file_extension": ".py", 288 | "mimetype": "text/x-python", 289 | "name": "python", 290 | "nbconvert_exporter": "python", 291 | "pygments_lexer": "ipython3", 292 | "version": "3.12.4" 293 | } 294 | }, 295 | "nbformat": 4, 296 | "nbformat_minor": 2 297 | } 298 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "ragbase" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Venelin Valkov "] 6 | license = "MIT" 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = ">=3.12,<3.13" 11 | langchain-community = "^0.2.6" 12 | flashrank = "^0.2.8" 13 | qdrant-client = "^1.10.1" 14 | pypdfium2 = "^4.30.0" 15 | fastembed = "^0.3.3" 16 | langchain-experimental = "^0.0.62" 17 | langchain-qdrant = "^0.1.1" 18 | streamlit = "^1.36.0" 19 | python-dotenv = "^1.0.1" 20 | langchain-groq = "^0.1.6" 21 | 22 | 23 | [tool.poetry.group.dev.dependencies] 24 | ruff = "^0.5.1" 25 | jupyterlab = "^4.2.3" 26 | watchdog = "^4.0.1" 27 | 28 | [build-system] 29 | requires = ["poetry-core"] 30 | build-backend = "poetry.core.masonry.api" 31 | -------------------------------------------------------------------------------- /ragbase/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/curiousily/ragbase/5af7b79162329fa0b725b39d1aeeea7eb1e4428b/ragbase/__init__.py -------------------------------------------------------------------------------- /ragbase/chain.py: -------------------------------------------------------------------------------- 1 | import re 2 | from operator import itemgetter 3 | from typing import List 4 | 5 | from langchain.schema.runnable import RunnablePassthrough 6 | from langchain_core.documents import Document 7 | from langchain_core.language_models import BaseLanguageModel 8 | from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder 9 | from langchain_core.runnables import Runnable 10 | from langchain_core.runnables.history import RunnableWithMessageHistory 11 | from langchain_core.tracers.stdout import ConsoleCallbackHandler 12 | from langchain_core.vectorstores import VectorStoreRetriever 13 | 14 | from ragbase.config import Config 15 | from ragbase.session_history import get_session_history 16 | 17 | SYSTEM_PROMPT = """ 18 | Utilize the provided contextual information to respond to the user question. 19 | If the answer is not found within the context, state that the answer cannot be found. 20 | Prioritize concise responses (maximum of 3 sentences) and use a list where applicable. 21 | The contextual information is organized with the most relevant source appearing first. 22 | Each source is separated by a horizontal rule (---). 23 | 24 | Context: 25 | {context} 26 | 27 | Use markdown formatting where appropriate. 
28 | """ 29 | 30 | 31 | def remove_links(text: str) -> str: 32 | url_pattern = r"https?://\S+|www\.\S+" 33 | return re.sub(url_pattern, "", text) 34 | 35 | 36 | def format_documents(documents: List[Document]) -> str: 37 | texts = [] 38 | for doc in documents: 39 | texts.append(doc.page_content) 40 | texts.append("---") 41 | 42 | return remove_links("\n".join(texts)) 43 | 44 | 45 | def create_chain(llm: BaseLanguageModel, retriever: VectorStoreRetriever) -> Runnable: 46 | prompt = ChatPromptTemplate.from_messages( 47 | [ 48 | ("system", SYSTEM_PROMPT), 49 | MessagesPlaceholder("chat_history"), 50 | ("human", "{question}"), 51 | ] 52 | ) 53 | 54 | chain = ( 55 | RunnablePassthrough.assign( 56 | context=itemgetter("question") 57 | | retriever.with_config({"run_name": "context_retriever"}) 58 | | format_documents 59 | ) 60 | | prompt 61 | | llm 62 | ) 63 | 64 | return RunnableWithMessageHistory( 65 | chain, 66 | get_session_history, 67 | input_messages_key="question", 68 | history_messages_key="chat_history", 69 | ).with_config({"run_name": "chain_answer"}) 70 | 71 | 72 | async def ask_question(chain: Runnable, question: str, session_id: str): 73 | async for event in chain.astream_events( 74 | {"question": question}, 75 | config={ 76 | "callbacks": [ConsoleCallbackHandler()] if Config.DEBUG else [], 77 | "configurable": {"session_id": session_id}, 78 | }, 79 | version="v2", 80 | include_names=["context_retriever", "chain_answer"], 81 | ): 82 | event_type = event["event"] 83 | if event_type == "on_retriever_end": 84 | yield event["data"]["output"] 85 | if event_type == "on_chain_stream": 86 | yield event["data"]["chunk"].content 87 | -------------------------------------------------------------------------------- /ragbase/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | 5 | class Config: 6 | class Path: 7 | APP_HOME = Path(os.getenv("APP_HOME", Path(__file__).parent.parent)) 8 | DATABASE_DIR = APP_HOME / "docs-db" 9 | DOCUMENTS_DIR = APP_HOME / "tmp" 10 | IMAGES_DIR = APP_HOME / "images" 11 | 12 | class Database: 13 | DOCUMENTS_COLLECTION = "documents" 14 | 15 | class Model: 16 | EMBEDDINGS = "BAAI/bge-base-en-v1.5" 17 | RERANKER = "ms-marco-MiniLM-L-12-v2" 18 | LOCAL_LLM = "gemma2:9b" 19 | REMOTE_LLM = "llama-3.1-70b-versatile" 20 | TEMPERATURE = 0.0 21 | MAX_TOKENS = 8000 22 | USE_LOCAL = False 23 | 24 | class Retriever: 25 | USE_RERANKER = True 26 | USE_CHAIN_FILTER = False 27 | 28 | DEBUG = False 29 | CONVERSATION_MESSAGES_LIMIT = 6 30 | -------------------------------------------------------------------------------- /ragbase/ingestor.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import List 3 | 4 | from langchain_community.document_loaders import PyPDFium2Loader 5 | from langchain_community.embeddings.fastembed import FastEmbedEmbeddings 6 | from langchain_core.vectorstores import VectorStore 7 | from langchain_experimental.text_splitter import SemanticChunker 8 | from langchain_qdrant import Qdrant 9 | from langchain_text_splitters import RecursiveCharacterTextSplitter 10 | 11 | from ragbase.config import Config 12 | 13 | 14 | class Ingestor: 15 | def __init__(self): 16 | self.embeddings = FastEmbedEmbeddings(model_name=Config.Model.EMBEDDINGS) 17 | self.semantic_splitter = SemanticChunker( 18 | self.embeddings, breakpoint_threshold_type="interquartile" 19 | ) 20 | self.recursive_splitter = 
RecursiveCharacterTextSplitter( 21 | chunk_size=2048, 22 | chunk_overlap=128, 23 | add_start_index=True, 24 | ) 25 | 26 | def ingest(self, doc_paths: List[Path]) -> VectorStore: 27 | documents = [] 28 | for doc_path in doc_paths: 29 | loaded_documents = PyPDFium2Loader(doc_path).load() 30 | document_text = "\n".join([doc.page_content for doc in loaded_documents]) 31 | documents.extend( 32 | self.recursive_splitter.split_documents( 33 | self.semantic_splitter.create_documents([document_text]) 34 | ) 35 | ) 36 | return Qdrant.from_documents( 37 | documents=documents, 38 | embedding=self.embeddings, 39 | path=Config.Path.DATABASE_DIR, 40 | collection_name=Config.Database.DOCUMENTS_COLLECTION, 41 | ) 42 | -------------------------------------------------------------------------------- /ragbase/model.py: -------------------------------------------------------------------------------- 1 | from langchain_community.chat_models import ChatOllama 2 | from langchain_community.document_compressors.flashrank_rerank import FlashrankRerank 3 | from langchain_community.embeddings.fastembed import FastEmbedEmbeddings 4 | from langchain_core.language_models import BaseLanguageModel 5 | from langchain_groq import ChatGroq 6 | 7 | from ragbase.config import Config 8 | 9 | 10 | def create_llm() -> BaseLanguageModel: 11 | if Config.Model.USE_LOCAL: 12 | return ChatOllama( 13 | model=Config.Model.LOCAL_LLM, 14 | temperature=Config.Model.TEMPERATURE, 15 | keep_alive="1h", 16 | max_tokens=Config.Model.MAX_TOKENS, 17 | ) 18 | else: 19 | return ChatGroq( 20 | temperature=Config.Model.TEMPERATURE, 21 | model_name=Config.Model.REMOTE_LLM, 22 | max_tokens=Config.Model.MAX_TOKENS, 23 | ) 24 | 25 | 26 | def create_embeddings() -> FastEmbedEmbeddings: 27 | return FastEmbedEmbeddings(model_name=Config.Model.EMBEDDINGS) 28 | 29 | 30 | def create_reranker() -> FlashrankRerank: 31 | return FlashrankRerank(model=Config.Model.RERANKER) 32 | -------------------------------------------------------------------------------- /ragbase/retriever.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from langchain.retrievers import ContextualCompressionRetriever 4 | from langchain.retrievers.document_compressors.chain_filter import LLMChainFilter 5 | from langchain_core.language_models import BaseLanguageModel 6 | from langchain_core.vectorstores import VectorStore, VectorStoreRetriever 7 | from langchain_qdrant import Qdrant 8 | 9 | from ragbase.config import Config 10 | from ragbase.model import create_embeddings, create_reranker 11 | 12 | 13 | def create_retriever( 14 | llm: BaseLanguageModel, vector_store: Optional[VectorStore] = None 15 | ) -> VectorStoreRetriever: 16 | if not vector_store: 17 | vector_store = Qdrant.from_existing_collection( 18 | embedding=create_embeddings(), 19 | collection_name=Config.Database.DOCUMENTS_COLLECTION, 20 | path=Config.Path.DATABASE_DIR, 21 | ) 22 | 23 | retriever = vector_store.as_retriever( 24 | search_type="similarity", search_kwargs={"k": 5} 25 | ) 26 | 27 | if Config.Retriever.USE_RERANKER: 28 | retriever = ContextualCompressionRetriever( 29 | base_compressor=create_reranker(), base_retriever=retriever 30 | ) 31 | 32 | if Config.Retriever.USE_CHAIN_FILTER: 33 | retriever = ContextualCompressionRetriever( 34 | base_compressor=LLMChainFilter.from_llm(llm), base_retriever=retriever 35 | ) 36 | 37 | return retriever 38 | -------------------------------------------------------------------------------- 
/ragbase/session_history.py: -------------------------------------------------------------------------------- 1 | from langchain_community.chat_message_histories import ChatMessageHistory 2 | 3 | store = {} 4 | 5 | 6 | def get_session_history(session_id: str) -> ChatMessageHistory: 7 | if session_id not in store: 8 | store[session_id] = ChatMessageHistory() 9 | return store[session_id] 10 | -------------------------------------------------------------------------------- /ragbase/uploader.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from pathlib import Path 3 | from typing import List 4 | 5 | from streamlit.runtime.uploaded_file_manager import UploadedFile 6 | 7 | from ragbase.config import Config 8 | 9 | 10 | def upload_files( 11 | files: List[UploadedFile], remove_old_files: bool = True 12 | ) -> List[Path]: 13 | if remove_old_files: 14 | shutil.rmtree(Config.Path.DATABASE_DIR, ignore_errors=True) 15 | shutil.rmtree(Config.Path.DOCUMENTS_DIR, ignore_errors=True) 16 | Config.Path.DOCUMENTS_DIR.mkdir(parents=True, exist_ok=True) 17 | file_paths = [] 18 | for file in files: 19 | file_path = Config.Path.DOCUMENTS_DIR / file.name 20 | with file_path.open("wb") as f: 21 | f.write(file.getvalue()) 22 | file_paths.append(file_path) 23 | return file_paths 24 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fastembed==0.3.3 2 | flashrank==0.2.8 3 | langchain-community==0.2.6 4 | langchain-experimental==0.0.62 5 | langchain-groq==0.1.6 6 | langchain-qdrant==0.1.1 7 | langchain==0.2.6 8 | python-dotenv==1.0.1 9 | qdrant-client==1.10.1 10 | streamlit==1.36.0 11 | pypdfium2==4.30.0 --------------------------------------------------------------------------------