├── .env.example
├── .github
│   ├── architecture.png
│   └── ui.png
├── .gitignore
├── .streamlit
│   └── config.toml
├── .vscode
│   └── settings.json
├── LICENSE
├── README.md
├── app.py
├── images
│   ├── assistant-avatar.png
│   └── user-avatar.png
├── notebooks
│   └── reranking-and-filtering.ipynb
├── poetry.lock
├── pyproject.toml
├── ragbase
│   ├── __init__.py
│   ├── chain.py
│   ├── config.py
│   ├── ingestor.py
│   ├── model.py
│   ├── retriever.py
│   ├── session_history.py
│   └── uploader.py
└── requirements.txt
/.env.example:
--------------------------------------------------------------------------------
1 | GROQ_API_KEY="GROQ API KEY HERE"
--------------------------------------------------------------------------------
/.github/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/curiousily/ragbase/5af7b79162329fa0b725b39d1aeeea7eb1e4428b/.github/architecture.png
--------------------------------------------------------------------------------
/.github/ui.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/curiousily/ragbase/5af7b79162329fa0b725b39d1aeeea7eb1e4428b/.github/ui.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build/
113 |
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 |
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 |
121 | # SageMath parsed files
122 | *.sage.py
123 |
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 |
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 |
137 | # Rope project settings
138 | .ropeproject
139 |
140 | # mkdocs documentation
141 | /site
142 |
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 |
148 | # Pyre type checker
149 | .pyre/
150 |
151 | # pytype static type analyzer
152 | .pytype/
153 |
154 | # Cython debug symbols
155 | cython_debug/
156 |
157 | # PyCharm
158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | # and can be added to the global gitignore or merged into this file. For a more nuclear
161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 |
164 | .DS_Store
165 | docs-db
166 | tmp
--------------------------------------------------------------------------------
/.streamlit/config.toml:
--------------------------------------------------------------------------------
1 | [server]
2 | maxUploadSize = 2
3 |
4 | [theme]
5 | base="dark"
6 | primaryColor="#0284c7"
7 | backgroundColor="#111111"
8 | secondaryBackgroundColor="#1e1e1e"
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "[python]": {
3 | "editor.formatOnSave": true,
4 | "editor.defaultFormatter": "charliermarsh.ruff",
5 | "editor.codeActionsOnSave": {
6 | "source.fixAll": "never",
7 | "source.organizeImports": "explicit"
8 | }
9 | }
10 | }
11 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Venelin Valkov
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # RagBase - Private Chat with Your Documents
2 |
3 | > Completely local RAG with chat UI
4 |
5 | ![RagBase UI](.github/ui.png)
6 |
7 |
8 |
9 | ## Demo
10 |
11 | Check out [RagBase on Streamlit Cloud](https://ragbase.streamlit.app/). It runs with the Groq API.
12 |
13 | ## Installation
14 |
15 | Clone the repo:
16 |
17 | ```sh
18 | git clone git@github.com:curiousily/ragbase.git
19 | cd ragbase
20 | ```
21 |
22 | Install the dependencies (requires Poetry):
23 |
24 | ```sh
25 | poetry install
26 | ```
27 |
28 | Fetch your local LLM (`gemma2:9b` by default):
29 |
30 | ```sh
31 | ollama pull gemma2:9b
32 | ```
33 |
34 | Run the Ollama server:
35 |
36 | ```sh
37 | ollama serve
38 | ```
39 |
40 | Start RagBase:
41 |
42 | ```sh
43 | poetry run streamlit run app.py
44 | ```
45 |
46 | ## Architecture
47 |
48 | ![RagBase architecture](.github/architecture.png)
49 |
50 |
51 |
52 | ### Ingestor
53 |
54 | Extracts text from PDF documents and creates chunks (using semantic and character splitters) that are stored in a vector database.
55 |
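A minimal sketch of invoking the ingestion step on its own (mirroring `build_qa_chain` in `app.py`; the PDF path is a placeholder):

```python
from pathlib import Path

from ragbase.ingestor import Ingestor

# Split each PDF semantically, then recursively by characters,
# and store the chunks in a local Qdrant collection.
vector_store = Ingestor().ingest([Path("tmp/report.pdf")])
```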
56 | ### Retriever
57 |
58 | Given a query, searches for similar documents, reranks the results, and applies an LLM chain filter before returning the response.
59 |
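You can also exercise the retriever directly on an ingested vector store (a sketch based on `ragbase/retriever.py`; reranking and LLM chain filtering are toggled via `Config.Retriever`):

```python
from ragbase.model import create_llm
from ragbase.retriever import create_retriever

llm = create_llm()
# Similarity search (k=5), optionally wrapped with a FlashRank reranker
# and an LLMChainFilter, depending on the Config.Retriever flags.
retriever = create_retriever(llm, vector_store=vector_store)
docs = retriever.invoke("What is the price?")
```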
60 | ### QA Chain
61 |
62 | Combines the LLM with the retriever to answer a given user question.
63 |
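Putting the pieces together (a sketch of the flow in `app.py`; the question and session id are placeholders):

```python
import asyncio

from ragbase.chain import ask_question, create_chain

chain = create_chain(llm, retriever)

async def main():
    # ask_question streams answer tokens (str) and source documents (list)
    async for event in ask_question(chain, "What is the price?", session_id="demo"):
        if isinstance(event, str):
            print(event, end="")

asyncio.run(main())
```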
64 | ## Tech Stack
65 |
66 | - [Ollama](https://ollama.com/) - run local LLM
67 | - [Groq API](https://groq.com/) - fast inference for multiple LLMs
68 | - [LangChain](https://www.langchain.com/) - build LLM-powered apps
69 | - [Qdrant](https://qdrant.tech/) - vector search/database
70 | - [FlashRank](https://github.com/PrithivirajDamodaran/FlashRank) - fast reranking
71 | - [FastEmbed](https://qdrant.github.io/fastembed/) - lightweight and fast embedding generation
72 | - [Streamlit](https://streamlit.io/) - build UI for data apps
73 | - [PDFium](https://pdfium.googlesource.com/pdfium/) - PDF processing and text extraction
74 |
75 | ## Add Groq API Key (Optional)
76 |
77 | You can also use the Groq API instead of the local LLM. For that, you'll need a `.env` file with your Groq API key:
78 |
79 | ```sh
80 | GROQ_API_KEY=YOUR API KEY
81 | ```
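
Whether the local (Ollama) or remote (Groq) model is used is controlled by the `USE_LOCAL` flag in `ragbase/config.py`.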
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import random
3 |
4 | import streamlit as st
5 | from dotenv import load_dotenv
6 |
7 | from ragbase.chain import ask_question, create_chain
8 | from ragbase.config import Config
9 | from ragbase.ingestor import Ingestor
10 | from ragbase.model import create_llm
11 | from ragbase.retriever import create_retriever
12 | from ragbase.uploader import upload_files
13 |
14 | load_dotenv()
15 |
16 | LOADING_MESSAGES = [
17 | "Calculating your answer through multiverse...",
18 | "Adjusting quantum entanglement...",
19 | "Summoning star wisdom... almost there!",
20 | "Consulting Schrödinger's cat...",
21 | "Warping spacetime for your response...",
22 | "Balancing neutron star equations...",
23 | "Analyzing dark matter... please wait...",
24 | "Engaging hyperdrive... en route!",
25 | "Gathering photons from a galaxy...",
26 | "Beaming data from Andromeda... stand by!",
27 | ]
28 |
29 |
30 | @st.cache_resource(show_spinner=False)
31 | def build_qa_chain(files):
32 | file_paths = upload_files(files)
33 | vector_store = Ingestor().ingest(file_paths)
34 | llm = create_llm()
35 | retriever = create_retriever(llm, vector_store=vector_store)
36 | return create_chain(llm, retriever)
37 |
38 |
39 | async def ask_chain(question: str, chain):
40 | full_response = ""
41 | assistant = st.chat_message(
42 | "assistant", avatar=str(Config.Path.IMAGES_DIR / "assistant-avatar.png")
43 | )
44 | with assistant:
45 | message_placeholder = st.empty()
46 | message_placeholder.status(random.choice(LOADING_MESSAGES), state="running")
47 | documents = []
48 | async for event in ask_question(chain, question, session_id="session-id-42"):
49 | if type(event) is str:
50 | full_response += event
51 | message_placeholder.markdown(full_response)
52 | if type(event) is list:
53 | documents.extend(event)
54 | for i, doc in enumerate(documents):
55 | with st.expander(f"Source #{i+1}"):
56 | st.write(doc.page_content)
57 |
58 | st.session_state.messages.append({"role": "assistant", "content": full_response})
59 |
60 |
61 | def show_upload_documents():
62 | holder = st.empty()
63 | with holder.container():
64 | st.header("RagBase")
65 | st.subheader("Get answers from your documents")
66 | uploaded_files = st.file_uploader(
67 | label="Upload PDF files", type=["pdf"], accept_multiple_files=True
68 | )
69 | if not uploaded_files:
70 | st.warning("Please upload PDF documents to continue!")
71 | st.stop()
72 |
73 | with st.spinner("Analyzing your document(s)..."):
74 | holder.empty()
75 | return build_qa_chain(uploaded_files)
76 |
77 |
78 | def show_message_history():
79 | for message in st.session_state.messages:
80 | role = message["role"]
81 | avatar_path = (
82 | Config.Path.IMAGES_DIR / "assistant-avatar.png"
83 | if role == "assistant"
84 | else Config.Path.IMAGES_DIR / "user-avatar.png"
85 | )
86 | with st.chat_message(role, avatar=str(avatar_path)):
87 | st.markdown(message["content"])
88 |
89 |
90 | def show_chat_input(chain):
91 | if prompt := st.chat_input("Ask your question here"):
92 | st.session_state.messages.append({"role": "user", "content": prompt})
93 | with st.chat_message(
94 | "user",
95 | avatar=str(Config.Path.IMAGES_DIR / "user-avatar.png"),
96 | ):
97 | st.markdown(prompt)
98 | asyncio.run(ask_chain(prompt, chain))
99 |
100 |
101 | st.set_page_config(page_title="RagBase", page_icon="🐧")
102 |
103 | st.html(
104 | """
105 |
111 | """
112 | )
113 |
114 | if "messages" not in st.session_state:
115 | st.session_state.messages = [
116 | {
117 | "role": "assistant",
118 | "content": "Hi! What do you want to know about your documents?",
119 | }
120 | ]
121 |
122 | if Config.CONVERSATION_MESSAGES_LIMIT > 0 and Config.CONVERSATION_MESSAGES_LIMIT <= len(
123 | st.session_state.messages
124 | ):
125 | st.warning(
126 | "You have reached the conversation limit. Refresh the page to start a new conversation."
127 | )
128 | st.stop()
129 |
130 | chain = show_upload_documents()
131 | show_message_history()
132 | show_chat_input(chain)
133 |
--------------------------------------------------------------------------------
/images/assistant-avatar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/curiousily/ragbase/5af7b79162329fa0b725b39d1aeeea7eb1e4428b/images/assistant-avatar.png
--------------------------------------------------------------------------------
/images/user-avatar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/curiousily/ragbase/5af7b79162329fa0b725b39d1aeeea7eb1e4428b/images/user-avatar.png
--------------------------------------------------------------------------------
/notebooks/reranking-and-filtering.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from langchain_core.documents import Document\n",
10 | "from ragbase.model import create_reranker, create_llm\n",
11 | "from langchain.retrievers.document_compressors.chain_filter import LLMChainFilter\n",
12 | "from langchain.globals import set_verbose\n",
13 | "\n",
14 | "set_verbose(True)"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "reranker = create_reranker()"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 3,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "documents = [\n",
33 | " Document(\"\"\"Not since Porsche blew air cooling out the back door in 1999 with the 996-\n",
34 | "generation 911 has there been a bigger change that will upset more Porsche fans\n",
35 | "than the hybridization of the 911. Fans, however, are not necessarily owners, and\n",
36 | "those with the means will not mind the 21st-century tech one bit, because if\n",
37 | "Porsche didn't tell anyone this updated 992.2 was a hybrid, no one would know by\n",
38 | "how it drives.\n",
39 | "\"\"\"),\n",
40 | " Document(\"\"\"For now, the new 3.6-liter flat-six operating in perfect stoichiometry all the time,\n",
41 | "its electrified turbocharger, and eight-speed dual-clutch gearbox with an integral\n",
42 | "electric motor are limited to the GTS, which used to be powered by a jazzed-up\n",
43 | "version of the twin-turbo 3.0-liter in the base Carrera.\n",
44 | "\"\"\"),\n",
45 | " Document(\"\"\"\n",
46 | "Anyway, the screen is cheaper, but at least Porsche does some cool stuff with it.\n",
47 | "There are seven different views, including a classic five-gauge cluster, but the most\n",
48 | "interesting of them is a track-focused mode that clocks the tach so that the redline\n",
49 | "is near 12 o'clock. Porsche would have scored more points if it had kept the \n",
50 | "orientation of the tach numbers so that they locked with the twist, like a real\n",
51 | "clocked tach out of a 917.\n",
52 | "\"\"\"),\n",
53 | " Document(\"\"\"The bad news is that the GTS coupe starts at $166,895. The back seat is no longer\n",
54 | "standard, though adding the +2 second row is a no-cost option. That's mega\n",
55 | "money, but until Porsche sees sales drop, you better believe it will keep raising its\n",
56 | "prices. That's just basic supply-demand, and plenty of people will line up for this\n",
57 | "water-cooled hybrid.\n",
58 | "\"\"\")\n",
59 | "]"
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {},
65 | "source": [
66 | "## Reranker"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": 4,
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "reranked_documents = reranker.compress_documents(documents, \"What is the price?\")"
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 5,
81 | "metadata": {},
82 | "outputs": [
83 | {
84 | "name": "stdout",
85 | "output_type": "stream",
86 | "text": [
87 | "dict_keys(['id', 'metadata', 'page_content', 'type'])\n",
88 | "score: 0.12178549\n",
89 | "The bad news is that the GTS coupe starts at $166,895. The back seat is no longer\n",
90 | "standard, though adding the +2 second row is a no-cost option. That's mega\n",
91 | "money, but until Porsche sees sales drop, you better believe it will keep raising its\n",
92 | "prices. That's just basic supply-demand, and plenty of people will line up for this\n",
93 | "water-cooled hybrid.\n",
94 | "\n"
95 | ]
96 | }
97 | ],
98 | "source": [
99 | "first_result = reranked_documents[0]\n",
100 | "print(first_result.__dict__.keys())\n",
101 | "print(\"score:\", first_result.metadata[\"relevance_score\"])\n",
102 | "print(first_result.page_content)"
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "metadata": {},
108 | "source": [
109 | "## LLM Chain Filter"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 6,
115 | "metadata": {},
116 | "outputs": [
117 | {
118 | "name": "stdout",
119 | "output_type": "stream",
120 | "text": [
121 | "\n",
122 | "\n",
123 | "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
124 | "\n",
125 | "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
126 | "Prompt after formatting:\n",
127 | "\u001b[32;1m\u001b[1;3mGiven the following question and context, return YES if the context is relevant to the question and NO if it isn't.\n",
128 | "\n",
129 | "> Question: How many seats in the car?\n",
130 | "> Context:\n",
131 | ">>>\n",
132 | "For now, the new 3.6-liter flat-six operating in perfect stoichiometry all the time,\n",
133 | "its electrified turbocharger, and eight-speed dual-clutch gearbox with an integral\n",
134 | "electric motor are limited to the GTS, which used to be powered by a jazzed-up\n",
135 | "version of the twin-turbo 3.0-liter in the base Carrera.\n",
136 | "\n",
137 | ">>>\n",
138 | "> Relevant (YES / NO):\u001b[0m\n",
139 | "\n",
140 | "Prompt after formatting:\n",
141 | "\u001b[32;1m\u001b[1;3mGiven the following question and context, return YES if the context is relevant to the question and NO if it isn't.\n",
142 | "\n",
143 | "> Question: How many seats in the car?\n",
144 | "> Context:\n",
145 | ">>>\n",
146 | "Not since Porsche blew air cooling out the back door in 1999 with the 996-\n",
147 | "generation 911 has there been a bigger change that will upset more Porsche fans\n",
148 | "than the hybridization of the 911. Fans, however, are not necessarily owners, and\n",
149 | "those with the means will not mind the 21st-century tech one bit, because if\n",
150 | "Porsche didn't tell anyone this updated 992.2 was a hybrid, no one would know by\n",
151 | "how it drives.\n",
152 | "\n",
153 | ">>>\n",
154 | "> Relevant (YES / NO):\u001b[0m\n",
155 | "\n",
156 | "\n",
157 | "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
158 | "Prompt after formatting:\n",
159 | "\u001b[32;1m\u001b[1;3mGiven the following question and context, return YES if the context is relevant to the question and NO if it isn't.\n",
160 | "\n",
161 | "> Question: How many seats in the car?\n",
162 | "> Context:\n",
163 | ">>>\n",
164 | "\n",
165 | "Anyway, the screen is cheaper, but at least Porsche does some cool stuff with it.\n",
166 | "There are seven different views, including a classic five-gauge cluster, but the most\n",
167 | "interesting of them is a track-focused mode that clocks the tach so that the redline\n",
168 | "is near 12 o'clock. Porsche would have scored more points if it had kept the \n",
169 | "orientation of the tach numbers so that they locked with the twist, like a real\n",
170 | "clocked tach out of a 917.\n",
171 | "\n",
172 | ">>>\n",
173 | "> Relevant (YES / NO):\u001b[0m\n",
174 | "\n",
175 | "\n",
176 | "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n",
177 | "Prompt after formatting:\n",
178 | "\u001b[32;1m\u001b[1;3mGiven the following question and context, return YES if the context is relevant to the question and NO if it isn't.\n",
179 | "\n",
180 | "> Question: How many seats in the car?\n",
181 | "> Context:\n",
182 | ">>>\n",
183 | "The bad news is that the GTS coupe starts at $166,895. The back seat is no longer\n",
184 | "standard, though adding the +2 second row is a no-cost option. That's mega\n",
185 | "money, but until Porsche sees sales drop, you better believe it will keep raising its\n",
186 | "prices. That's just basic supply-demand, and plenty of people will line up for this\n",
187 | "water-cooled hybrid.\n",
188 | "\n",
189 | ">>>\n",
190 | "> Relevant (YES / NO):\u001b[0m\n"
191 | ]
192 | },
193 | {
194 | "name": "stderr",
195 | "output_type": "stream",
196 | "text": [
197 | "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
198 | "To disable this warning, you can either:\n",
199 | "\t- Avoid using `tokenizers` before the fork if possible\n",
200 | "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
201 | "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
202 | "To disable this warning, you can either:\n",
203 | "\t- Avoid using `tokenizers` before the fork if possible\n",
204 | "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
205 | "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
206 | "To disable this warning, you can either:\n",
207 | "\t- Avoid using `tokenizers` before the fork if possible\n",
208 | "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
209 | "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
210 | "To disable this warning, you can either:\n",
211 | "\t- Avoid using `tokenizers` before the fork if possible\n",
212 | "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
213 | "INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
214 | "INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n",
215 | "INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n"
216 | ]
217 | },
218 | {
219 | "name": "stdout",
220 | "output_type": "stream",
221 | "text": [
222 | "\n",
223 | "\u001b[1m> Finished chain.\u001b[0m\n",
224 | "\n",
225 | "\u001b[1m> Finished chain.\u001b[0m\n",
226 | "\n",
227 | "\u001b[1m> Finished chain.\u001b[0m\n"
228 | ]
229 | },
230 | {
231 | "name": "stderr",
232 | "output_type": "stream",
233 | "text": [
234 | "INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n"
235 | ]
236 | },
237 | {
238 | "name": "stdout",
239 | "output_type": "stream",
240 | "text": [
241 | "\n",
242 | "\u001b[1m> Finished chain.\u001b[0m\n"
243 | ]
244 | }
245 | ],
246 | "source": [
247 | "chain_filter = LLMChainFilter.from_llm(create_llm())\n",
248 | "filtered_documents = chain_filter.compress_documents(documents, \"How many seats in the car?\")"
249 | ]
250 | },
251 | {
252 | "cell_type": "code",
253 | "execution_count": 7,
254 | "metadata": {},
255 | "outputs": [
256 | {
257 | "name": "stdout",
258 | "output_type": "stream",
259 | "text": [
260 | "1\n",
261 | "The bad news is that the GTS coupe starts at $166,895. The back seat is no longer\n",
262 | "standard, though adding the +2 second row is a no-cost option. That's mega\n",
263 | "money, but until Porsche sees sales drop, you better believe it will keep raising its\n",
264 | "prices. That's just basic supply-demand, and plenty of people will line up for this\n",
265 | "water-cooled hybrid.\n",
266 | "\n"
267 | ]
268 | }
269 | ],
270 | "source": [
271 | "print(len(filtered_documents))\n",
272 | "print(filtered_documents[0].page_content)"
273 | ]
274 | }
275 | ],
276 | "metadata": {
277 | "kernelspec": {
278 | "display_name": "ragbase-YTjELISx-py3.12",
279 | "language": "python",
280 | "name": "python3"
281 | },
282 | "language_info": {
283 | "codemirror_mode": {
284 | "name": "ipython",
285 | "version": 3
286 | },
287 | "file_extension": ".py",
288 | "mimetype": "text/x-python",
289 | "name": "python",
290 | "nbconvert_exporter": "python",
291 | "pygments_lexer": "ipython3",
292 | "version": "3.12.4"
293 | }
294 | },
295 | "nbformat": 4,
296 | "nbformat_minor": 2
297 | }
298 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "ragbase"
3 | version = "0.1.0"
4 | description = ""
5 | authors = ["Venelin Valkov "]
6 | license = "MIT"
7 | readme = "README.md"
8 |
9 | [tool.poetry.dependencies]
10 | python = ">=3.12,<3.13"
11 | langchain-community = "^0.2.6"
12 | flashrank = "^0.2.8"
13 | qdrant-client = "^1.10.1"
14 | pypdfium2 = "^4.30.0"
15 | fastembed = "^0.3.3"
16 | langchain-experimental = "^0.0.62"
17 | langchain-qdrant = "^0.1.1"
18 | streamlit = "^1.36.0"
19 | python-dotenv = "^1.0.1"
20 | langchain-groq = "^0.1.6"
21 |
22 |
23 | [tool.poetry.group.dev.dependencies]
24 | ruff = "^0.5.1"
25 | jupyterlab = "^4.2.3"
26 | watchdog = "^4.0.1"
27 |
28 | [build-system]
29 | requires = ["poetry-core"]
30 | build-backend = "poetry.core.masonry.api"
31 |
--------------------------------------------------------------------------------
/ragbase/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/curiousily/ragbase/5af7b79162329fa0b725b39d1aeeea7eb1e4428b/ragbase/__init__.py
--------------------------------------------------------------------------------
/ragbase/chain.py:
--------------------------------------------------------------------------------
1 | import re
2 | from operator import itemgetter
3 | from typing import List
4 |
5 | from langchain.schema.runnable import RunnablePassthrough
6 | from langchain_core.documents import Document
7 | from langchain_core.language_models import BaseLanguageModel
8 | from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
9 | from langchain_core.runnables import Runnable
10 | from langchain_core.runnables.history import RunnableWithMessageHistory
11 | from langchain_core.tracers.stdout import ConsoleCallbackHandler
12 | from langchain_core.vectorstores import VectorStoreRetriever
13 |
14 | from ragbase.config import Config
15 | from ragbase.session_history import get_session_history
16 |
17 | SYSTEM_PROMPT = """
18 | Utilize the provided contextual information to respond to the user question.
19 | If the answer is not found within the context, state that the answer cannot be found.
20 | Prioritize concise responses (maximum of 3 sentences) and use a list where applicable.
21 | The contextual information is organized with the most relevant source appearing first.
22 | Each source is separated by a horizontal rule (---).
23 |
24 | Context:
25 | {context}
26 |
27 | Use markdown formatting where appropriate.
28 | """
29 |
30 |
31 | def remove_links(text: str) -> str:
32 | url_pattern = r"https?://\S+|www\.\S+"
33 | return re.sub(url_pattern, "", text)
34 |
35 |
36 | def format_documents(documents: List[Document]) -> str:
37 | texts = []
38 | for doc in documents:
39 | texts.append(doc.page_content)
40 | texts.append("---")
41 |
42 | return remove_links("\n".join(texts))
43 |
44 |
45 | def create_chain(llm: BaseLanguageModel, retriever: VectorStoreRetriever) -> Runnable:
46 | prompt = ChatPromptTemplate.from_messages(
47 | [
48 | ("system", SYSTEM_PROMPT),
49 | MessagesPlaceholder("chat_history"),
50 | ("human", "{question}"),
51 | ]
52 | )
53 |
54 | chain = (
55 | RunnablePassthrough.assign(
56 | context=itemgetter("question")
57 | | retriever.with_config({"run_name": "context_retriever"})
58 | | format_documents
59 | )
60 | | prompt
61 | | llm
62 | )
63 |
64 | return RunnableWithMessageHistory(
65 | chain,
66 | get_session_history,
67 | input_messages_key="question",
68 | history_messages_key="chat_history",
69 | ).with_config({"run_name": "chain_answer"})
70 |
71 |
72 | async def ask_question(chain: Runnable, question: str, session_id: str):
73 | async for event in chain.astream_events(
74 | {"question": question},
75 | config={
76 | "callbacks": [ConsoleCallbackHandler()] if Config.DEBUG else [],
77 | "configurable": {"session_id": session_id},
78 | },
79 | version="v2",
80 | include_names=["context_retriever", "chain_answer"],
81 | ):
82 | event_type = event["event"]
83 | if event_type == "on_retriever_end":
84 | yield event["data"]["output"]
85 | if event_type == "on_chain_stream":
86 | yield event["data"]["chunk"].content
87 |
--------------------------------------------------------------------------------
/ragbase/config.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 |
4 |
5 | class Config:
6 | class Path:
7 | APP_HOME = Path(os.getenv("APP_HOME", Path(__file__).parent.parent))
8 | DATABASE_DIR = APP_HOME / "docs-db"
9 | DOCUMENTS_DIR = APP_HOME / "tmp"
10 | IMAGES_DIR = APP_HOME / "images"
11 |
12 | class Database:
13 | DOCUMENTS_COLLECTION = "documents"
14 |
15 | class Model:
16 | EMBEDDINGS = "BAAI/bge-base-en-v1.5"
17 | RERANKER = "ms-marco-MiniLM-L-12-v2"
18 | LOCAL_LLM = "gemma2:9b"
19 | REMOTE_LLM = "llama-3.1-70b-versatile"
20 | TEMPERATURE = 0.0
21 | MAX_TOKENS = 8000
22 | USE_LOCAL = False
23 |
24 | class Retriever:
25 | USE_RERANKER = True
26 | USE_CHAIN_FILTER = False
27 |
28 | DEBUG = False
29 | CONVERSATION_MESSAGES_LIMIT = 6
30 |
--------------------------------------------------------------------------------
/ragbase/ingestor.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from typing import List
3 |
4 | from langchain_community.document_loaders import PyPDFium2Loader
5 | from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
6 | from langchain_core.vectorstores import VectorStore
7 | from langchain_experimental.text_splitter import SemanticChunker
8 | from langchain_qdrant import Qdrant
9 | from langchain_text_splitters import RecursiveCharacterTextSplitter
10 |
11 | from ragbase.config import Config
12 |
13 |
14 | class Ingestor:
15 | def __init__(self):
16 | self.embeddings = FastEmbedEmbeddings(model_name=Config.Model.EMBEDDINGS)
17 | self.semantic_splitter = SemanticChunker(
18 | self.embeddings, breakpoint_threshold_type="interquartile"
19 | )
20 | self.recursive_splitter = RecursiveCharacterTextSplitter(
21 | chunk_size=2048,
22 | chunk_overlap=128,
23 | add_start_index=True,
24 | )
25 |
26 | def ingest(self, doc_paths: List[Path]) -> VectorStore:
27 | documents = []
28 | for doc_path in doc_paths:
29 | loaded_documents = PyPDFium2Loader(doc_path).load()
30 | document_text = "\n".join([doc.page_content for doc in loaded_documents])
31 | documents.extend(
32 | self.recursive_splitter.split_documents(
33 | self.semantic_splitter.create_documents([document_text])
34 | )
35 | )
36 | return Qdrant.from_documents(
37 | documents=documents,
38 | embedding=self.embeddings,
39 | path=Config.Path.DATABASE_DIR,
40 | collection_name=Config.Database.DOCUMENTS_COLLECTION,
41 | )
42 |
--------------------------------------------------------------------------------
/ragbase/model.py:
--------------------------------------------------------------------------------
1 | from langchain_community.chat_models import ChatOllama
2 | from langchain_community.document_compressors.flashrank_rerank import FlashrankRerank
3 | from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
4 | from langchain_core.language_models import BaseLanguageModel
5 | from langchain_groq import ChatGroq
6 |
7 | from ragbase.config import Config
8 |
9 |
10 | def create_llm() -> BaseLanguageModel:
11 | if Config.Model.USE_LOCAL:
12 | return ChatOllama(
13 | model=Config.Model.LOCAL_LLM,
14 | temperature=Config.Model.TEMPERATURE,
15 | keep_alive="1h",
16 | max_tokens=Config.Model.MAX_TOKENS,
17 | )
18 | else:
19 | return ChatGroq(
20 | temperature=Config.Model.TEMPERATURE,
21 | model_name=Config.Model.REMOTE_LLM,
22 | max_tokens=Config.Model.MAX_TOKENS,
23 | )
24 |
25 |
26 | def create_embeddings() -> FastEmbedEmbeddings:
27 | return FastEmbedEmbeddings(model_name=Config.Model.EMBEDDINGS)
28 |
29 |
30 | def create_reranker() -> FlashrankRerank:
31 | return FlashrankRerank(model=Config.Model.RERANKER)
32 |
--------------------------------------------------------------------------------
/ragbase/retriever.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | from langchain.retrievers import ContextualCompressionRetriever
4 | from langchain.retrievers.document_compressors.chain_filter import LLMChainFilter
5 | from langchain_core.language_models import BaseLanguageModel
6 | from langchain_core.vectorstores import VectorStore, VectorStoreRetriever
7 | from langchain_qdrant import Qdrant
8 |
9 | from ragbase.config import Config
10 | from ragbase.model import create_embeddings, create_reranker
11 |
12 |
13 | def create_retriever(
14 | llm: BaseLanguageModel, vector_store: Optional[VectorStore] = None
15 | ) -> VectorStoreRetriever:
16 | if not vector_store:
17 | vector_store = Qdrant.from_existing_collection(
18 | embedding=create_embeddings(),
19 | collection_name=Config.Database.DOCUMENTS_COLLECTION,
20 | path=Config.Path.DATABASE_DIR,
21 | )
22 |
23 | retriever = vector_store.as_retriever(
24 | search_type="similarity", search_kwargs={"k": 5}
25 | )
26 |
27 | if Config.Retriever.USE_RERANKER:
28 | retriever = ContextualCompressionRetriever(
29 | base_compressor=create_reranker(), base_retriever=retriever
30 | )
31 |
32 | if Config.Retriever.USE_CHAIN_FILTER:
33 | retriever = ContextualCompressionRetriever(
34 | base_compressor=LLMChainFilter.from_llm(llm), base_retriever=retriever
35 | )
36 |
37 | return retriever
38 |
--------------------------------------------------------------------------------
/ragbase/session_history.py:
--------------------------------------------------------------------------------
1 | from langchain_community.chat_message_histories import ChatMessageHistory
2 |
3 | store = {}
4 |
5 |
6 | def get_session_history(session_id: str) -> ChatMessageHistory:
7 | if session_id not in store:
8 | store[session_id] = ChatMessageHistory()
9 | return store[session_id]
10 |
--------------------------------------------------------------------------------
/ragbase/uploader.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | from pathlib import Path
3 | from typing import List
4 |
5 | from streamlit.runtime.uploaded_file_manager import UploadedFile
6 |
7 | from ragbase.config import Config
8 |
9 |
10 | def upload_files(
11 | files: List[UploadedFile], remove_old_files: bool = True
12 | ) -> List[Path]:
13 | if remove_old_files:
14 | shutil.rmtree(Config.Path.DATABASE_DIR, ignore_errors=True)
15 | shutil.rmtree(Config.Path.DOCUMENTS_DIR, ignore_errors=True)
16 | Config.Path.DOCUMENTS_DIR.mkdir(parents=True, exist_ok=True)
17 | file_paths = []
18 | for file in files:
19 | file_path = Config.Path.DOCUMENTS_DIR / file.name
20 | with file_path.open("wb") as f:
21 | f.write(file.getvalue())
22 | file_paths.append(file_path)
23 | return file_paths
24 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | fastembed==0.3.3
2 | flashrank==0.2.8
3 | langchain-community==0.2.6
4 | langchain-experimental==0.0.62
5 | langchain-groq==0.1.6
6 | langchain-qdrant==0.1.1
7 | langchain==0.2.6
8 | python-dotenv==1.0.1
9 | qdrant-client==1.10.1
10 | streamlit==1.36.0
11 | pypdfium2==4.30.0
--------------------------------------------------------------------------------