├── .env.example
├── .github
│   ├── architecture.png
│   └── ui.png
├── .gitignore
├── .streamlit
│   └── config.toml
├── .vscode
│   └── settings.json
├── LICENSE
├── README.md
├── app.py
├── images
│   ├── assistant-avatar.png
│   └── user-avatar.png
├── notebooks
│   └── reranking-and-filtering.ipynb
├── poetry.lock
├── pyproject.toml
├── ragbase
│   ├── __init__.py
│   ├── chain.py
│   ├── config.py
│   ├── ingestor.py
│   ├── model.py
│   ├── retriever.py
│   ├── session_history.py
│   └── uploader.py
└── requirements.txt
/.env.example:
--------------------------------------------------------------------------------
1 | GROQ_API_KEY="GROQ API KEY HERE"
--------------------------------------------------------------------------------
/.github/architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/curiousily/ragbase/5af7b79162329fa0b725b39d1aeeea7eb1e4428b/.github/architecture.png
--------------------------------------------------------------------------------
/.github/ui.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/curiousily/ragbase/5af7b79162329fa0b725b39d1aeeea7eb1e4428b/.github/ui.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build/
113 |
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 |
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 |
121 | # SageMath parsed files
122 | *.sage.py
123 |
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 |
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 |
137 | # Rope project settings
138 | .ropeproject
139 |
140 | # mkdocs documentation
141 | /site
142 |
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 |
148 | # Pyre type checker
149 | .pyre/
150 |
151 | # pytype static type analyzer
152 | .pytype/
153 |
154 | # Cython debug symbols
155 | cython_debug/
156 |
157 | # PyCharm
158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | # and can be added to the global gitignore or merged into this file. For a more nuclear
161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 |
164 | .DS_Store
165 | docs-db
166 | tmp
--------------------------------------------------------------------------------
/.streamlit/config.toml:
--------------------------------------------------------------------------------
1 | [server]
2 | maxUploadSize = 2
3 |
4 | [theme]
5 | base="dark"
6 | primaryColor="#0284c7"
7 | backgroundColor="#111111"
8 | secondaryBackgroundColor="#1e1e1e"
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 |   "[python]": {
3 |     "editor.formatOnSave": true,
4 |     "editor.defaultFormatter": "charliermarsh.ruff",
5 |     "editor.codeActionsOnSave": {
6 |       "source.fixAll": "never",
7 |       "source.organizeImports": "explicit"
8 |     }
9 |   }
10 | }
11 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Venelin Valkov
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # RagBase - Private Chat with Your Documents
2 |
3 | > Completely local RAG with chat UI
4 |
5 |
6 |
7 |
8 |
9 | ## Demo
10 |
11 | Check out the [RagBase demo on Streamlit Cloud](https://ragbase.streamlit.app/). It runs on the Groq API.
12 |
13 | ## Installation
14 |
15 | Clone the repo:
16 |
17 | ```sh
18 | git clone git@github.com:curiousily/ragbase.git
19 | cd ragbase
20 | ```
21 |
22 | Install the dependencies (requires Poetry):
23 |
24 | ```sh
25 | poetry install
26 | ```
27 |
28 | Fetch your LLM (gemma2:9b by default):
29 |
30 | ```sh
31 | ollama pull gemma2:9b
32 | ```
33 |
34 | Run the Ollama server:
35 |
36 | ```sh
37 | ollama serve
38 | ```
39 |
40 | Start RagBase:
41 |
42 | ```sh
43 | poetry run streamlit run app.py
44 | ```
45 |
46 | ## Architecture
47 |
48 |
49 |
50 |
51 |
52 | ### Ingestor
53 |
54 | Extracts text from PDF documents and creates chunks (using semantic and character splitters) that are stored in a vector database.
55 |
56 | ### Retriever
57 |
58 | Given a query, searches for similar documents, reranks the results, and applies an LLM chain filter before returning the response.
59 |
60 | ### QA Chain
61 |
62 | Combines the LLM with the retriever to answer a given user question. See the usage sketch at the end of this README for how these pieces compose.
63 |
64 | ## Tech Stack
65 |
66 | - [Ollama](https://ollama.com/) - run local LLMs
67 | - [Groq API](https://groq.com/) - fast inference for multiple LLMs
68 | - [LangChain](https://www.langchain.com/) - build LLM-powered apps
69 | - [Qdrant](https://qdrant.tech/) - vector search/database
70 | - [FlashRank](https://github.com/PrithivirajDamodaran/FlashRank) - fast reranking
71 | - [FastEmbed](https://qdrant.github.io/fastembed/) - lightweight and fast embedding generation
72 | - [Streamlit](https://streamlit.io/) - build UI for data apps
73 | - [PDFium](https://pdfium.googlesource.com/pdfium/) - PDF processing and text extraction
74 |
75 | ## Add Groq API Key (Optional)
76 |
77 | You can also use the Groq API instead of the local LLM. For that, you'll need a `.env` file with your Groq API key:
78 |
79 | ```sh
80 | GROQ_API_KEY=YOUR API KEY
81 | ```
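82 |
83 | ## Programmatic Usage (Sketch)
84 |
85 | The ingestor, retriever, and QA chain can also be composed outside the Streamlit UI. The following is a minimal sketch built from the functions in the `ragbase` package; the PDF path, question, and session id are illustrative placeholders, and the defaults from `ragbase/config.py` are assumed:
86 |
87 | ```py
88 | # Minimal sketch: wire the RAG pieces together without the UI.
89 | # The PDF path and question below are placeholders, not files/values from this repo.
90 | import asyncio
91 | from pathlib import Path
92 |
93 | from dotenv import load_dotenv
94 |
95 | from ragbase.chain import ask_question, create_chain
96 | from ragbase.ingestor import Ingestor
97 | from ragbase.model import create_llm
98 | from ragbase.retriever import create_retriever
99 |
100 | load_dotenv()  # loads GROQ_API_KEY when the remote LLM is used (the default)
101 |
102 |
103 | async def main():
104 |     # Chunk and embed the PDF(s), storing them in the local Qdrant collection
105 |     vector_store = Ingestor().ingest([Path("docs/example.pdf")])
106 |     llm = create_llm()
107 |     retriever = create_retriever(llm, vector_store=vector_store)
108 |     chain = create_chain(llm, retriever)
109 |     # ask_question streams events: strings are answer tokens, lists hold source documents
110 |     async for event in ask_question(chain, "What is this document about?", session_id="demo"):
111 |         if isinstance(event, str):
112 |             print(event, end="")
113 |
114 |
115 | asyncio.run(main())
116 | ```
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import random
3 |
4 | import streamlit as st
5 | from dotenv import load_dotenv
6 |
7 | from ragbase.chain import ask_question, create_chain
8 | from ragbase.config import Config
9 | from ragbase.ingestor import Ingestor
10 | from ragbase.model import create_llm
11 | from ragbase.retriever import create_retriever
12 | from ragbase.uploader import upload_files
13 |
14 | load_dotenv()
15 |
16 | LOADING_MESSAGES = [
17 |     "Calculating your answer through multiverse...",
18 |     "Adjusting quantum entanglement...",
19 |     "Summoning star wisdom... 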
almost there!", 20 | "Consulting Schrödinger's cat...", 21 | "Warping spacetime for your response...", 22 | "Balancing neutron star equations...", 23 | "Analyzing dark matter... please wait...", 24 | "Engaging hyperdrive... en route!", 25 | "Gathering photons from a galaxy...", 26 | "Beaming data from Andromeda... stand by!", 27 | ] 28 | 29 | 30 | @st.cache_resource(show_spinner=False) 31 | def build_qa_chain(files): 32 | file_paths = upload_files(files) 33 | vector_store = Ingestor().ingest(file_paths) 34 | llm = create_llm() 35 | retriever = create_retriever(llm, vector_store=vector_store) 36 | return create_chain(llm, retriever) 37 | 38 | 39 | async def ask_chain(question: str, chain): 40 | full_response = "" 41 | assistant = st.chat_message( 42 | "assistant", avatar=str(Config.Path.IMAGES_DIR / "assistant-avatar.png") 43 | ) 44 | with assistant: 45 | message_placeholder = st.empty() 46 | message_placeholder.status(random.choice(LOADING_MESSAGES), state="running") 47 | documents = [] 48 | async for event in ask_question(chain, question, session_id="session-id-42"): 49 | if type(event) is str: 50 | full_response += event 51 | message_placeholder.markdown(full_response) 52 | if type(event) is list: 53 | documents.extend(event) 54 | for i, doc in enumerate(documents): 55 | with st.expander(f"Source #{i+1}"): 56 | st.write(doc.page_content) 57 | 58 | st.session_state.messages.append({"role": "assistant", "content": full_response}) 59 | 60 | 61 | def show_upload_documents(): 62 | holder = st.empty() 63 | with holder.container(): 64 | st.header("RagBase") 65 | st.subheader("Get answers from your documents") 66 | uploaded_files = st.file_uploader( 67 | label="Upload PDF files", type=["pdf"], accept_multiple_files=True 68 | ) 69 | if not uploaded_files: 70 | st.warning("Please upload PDF documents to continue!") 71 | st.stop() 72 | 73 | with st.spinner("Analyzing your document(s)..."): 74 | holder.empty() 75 | return build_qa_chain(uploaded_files) 76 | 77 | 78 | def show_message_history(): 79 | for message in st.session_state.messages: 80 | role = message["role"] 81 | avatar_path = ( 82 | Config.Path.IMAGES_DIR / "assistant-avatar.png" 83 | if role == "assistant" 84 | else Config.Path.IMAGES_DIR / "user-avatar.png" 85 | ) 86 | with st.chat_message(role, avatar=str(avatar_path)): 87 | st.markdown(message["content"]) 88 | 89 | 90 | def show_chat_input(chain): 91 | if prompt := st.chat_input("Ask your question here"): 92 | st.session_state.messages.append({"role": "user", "content": prompt}) 93 | with st.chat_message( 94 | "user", 95 | avatar=str(Config.Path.IMAGES_DIR / "user-avatar.png"), 96 | ): 97 | st.markdown(prompt) 98 | asyncio.run(ask_chain(prompt, chain)) 99 | 100 | 101 | st.set_page_config(page_title="RagBase", page_icon="🐧") 102 | 103 | st.html( 104 | """ 105 | 111 | """ 112 | ) 113 | 114 | if "messages" not in st.session_state: 115 | st.session_state.messages = [ 116 | { 117 | "role": "assistant", 118 | "content": "Hi! What do you want to know about your documents?", 119 | } 120 | ] 121 | 122 | if Config.CONVERSATION_MESSAGES_LIMIT > 0 and Config.CONVERSATION_MESSAGES_LIMIT <= len( 123 | st.session_state.messages 124 | ): 125 | st.warning( 126 | "You have reached the conversation limit. Refresh the page to start a new conversation." 
127 | ) 128 | st.stop() 129 | 130 | chain = show_upload_documents() 131 | show_message_history() 132 | show_chat_input(chain) 133 | -------------------------------------------------------------------------------- /images/assistant-avatar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/curiousily/ragbase/5af7b79162329fa0b725b39d1aeeea7eb1e4428b/images/assistant-avatar.png -------------------------------------------------------------------------------- /images/user-avatar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/curiousily/ragbase/5af7b79162329fa0b725b39d1aeeea7eb1e4428b/images/user-avatar.png -------------------------------------------------------------------------------- /notebooks/reranking-and-filtering.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from langchain_core.documents import Document\n", 10 | "from ragbase.model import create_reranker, create_llm\n", 11 | "from langchain.retrievers.document_compressors.chain_filter import LLMChainFilter\n", 12 | "from langchain.globals import set_verbose\n", 13 | "\n", 14 | "set_verbose(True)" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "reranker = create_reranker()" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 3, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "documents = [\n", 33 | " Document(\"\"\"Not since Porsche blew air cooling out the back door in 1999 with the 996-\n", 34 | "generation 911 has there been a bigger change that will upset more Porsche fans\n", 35 | "than the hybridization of the 911. Fans, however, are not necessarily owners, and\n", 36 | "those with the means will not mind the 21st-century tech one bit, because if\n", 37 | "Porsche didn't tell anyone this updated 992.2 was a hybrid, no one would know by\n", 38 | "how it drives.\n", 39 | "\"\"\"),\n", 40 | " Document(\"\"\"For now, the new 3.6-liter flat-six operating in perfect stoichiometry all the time,\n", 41 | "its electrified turbocharger, and eight-speed dual-clutch gearbox with an integral\n", 42 | "electric motor are limited to the GTS, which used to be powered by a jazzed-up\n", 43 | "version of the twin-turbo 3.0-liter in the base Carrera.\n", 44 | "\"\"\"),\n", 45 | " Document(\"\"\"\n", 46 | "Anyway, the screen is cheaper, but at least Porsche does some cool stuff with it.\n", 47 | "There are seven different views, including a classic five-gauge cluster, but the most\n", 48 | "interesting of them is a track-focused mode that clocks the tach so that the redline\n", 49 | "is near 12 o'clock. Porsche would have scored more points if it had kept the \n", 50 | "orientation of the tach numbers so that they locked with the twist, like a real\n", 51 | "clocked tach out of a 917.\n", 52 | "\"\"\"),\n", 53 | " Document(\"\"\"The bad news is that the GTS coupe starts at $166,895. The back seat is no longer\n", 54 | "standard, though adding the +2 second row is a no-cost option. That's mega\n", 55 | "money, but until Porsche sees sales drop, you better believe it will keep raising its\n", 56 | "prices. 
That's just basic supply-demand, and plenty of people will line up for this\n", 57 | "water-cooled hybrid.\n", 58 | "\"\"\")\n", 59 | "]" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "## Reranker" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 4, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "reranked_documents = reranker.compress_documents(documents, \"What is the price?\")" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 5, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "dict_keys(['id', 'metadata', 'page_content', 'type'])\n", 88 | "score: 0.12178549\n", 89 | "The bad news is that the GTS coupe starts at $166,895. The back seat is no longer\n", 90 | "standard, though adding the +2 second row is a no-cost option. That's mega\n", 91 | "money, but until Porsche sees sales drop, you better believe it will keep raising its\n", 92 | "prices. That's just basic supply-demand, and plenty of people will line up for this\n", 93 | "water-cooled hybrid.\n", 94 | "\n" 95 | ] 96 | } 97 | ], 98 | "source": [ 99 | "first_result = reranked_documents[0]\n", 100 | "print(first_result.__dict__.keys())\n", 101 | "print(\"score:\", first_result.metadata[\"relevance_score\"])\n", 102 | "print(first_result.page_content)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "## LLM Chain Filter" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 6, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | "\n", 122 | "\n", 123 | "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n", 124 | "\n", 125 | "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n", 126 | "Prompt after formatting:\n", 127 | "\u001b[32;1m\u001b[1;3mGiven the following question and context, return YES if the context is relevant to the question and NO if it isn't.\n", 128 | "\n", 129 | "> Question: How many seats in the car?\n", 130 | "> Context:\n", 131 | ">>>\n", 132 | "For now, the new 3.6-liter flat-six operating in perfect stoichiometry all the time,\n", 133 | "its electrified turbocharger, and eight-speed dual-clutch gearbox with an integral\n", 134 | "electric motor are limited to the GTS, which used to be powered by a jazzed-up\n", 135 | "version of the twin-turbo 3.0-liter in the base Carrera.\n", 136 | "\n", 137 | ">>>\n", 138 | "> Relevant (YES / NO):\u001b[0m\n", 139 | "\n", 140 | "Prompt after formatting:\n", 141 | "\u001b[32;1m\u001b[1;3mGiven the following question and context, return YES if the context is relevant to the question and NO if it isn't.\n", 142 | "\n", 143 | "> Question: How many seats in the car?\n", 144 | "> Context:\n", 145 | ">>>\n", 146 | "Not since Porsche blew air cooling out the back door in 1999 with the 996-\n", 147 | "generation 911 has there been a bigger change that will upset more Porsche fans\n", 148 | "than the hybridization of the 911. 
Fans, however, are not necessarily owners, and\n", 149 | "those with the means will not mind the 21st-century tech one bit, because if\n", 150 | "Porsche didn't tell anyone this updated 992.2 was a hybrid, no one would know by\n", 151 | "how it drives.\n", 152 | "\n", 153 | ">>>\n", 154 | "> Relevant (YES / NO):\u001b[0m\n", 155 | "\n", 156 | "\n", 157 | "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n", 158 | "Prompt after formatting:\n", 159 | "\u001b[32;1m\u001b[1;3mGiven the following question and context, return YES if the context is relevant to the question and NO if it isn't.\n", 160 | "\n", 161 | "> Question: How many seats in the car?\n", 162 | "> Context:\n", 163 | ">>>\n", 164 | "\n", 165 | "Anyway, the screen is cheaper, but at least Porsche does some cool stuff with it.\n", 166 | "There are seven different views, including a classic five-gauge cluster, but the most\n", 167 | "interesting of them is a track-focused mode that clocks the tach so that the redline\n", 168 | "is near 12 o'clock. Porsche would have scored more points if it had kept the \n", 169 | "orientation of the tach numbers so that they locked with the twist, like a real\n", 170 | "clocked tach out of a 917.\n", 171 | "\n", 172 | ">>>\n", 173 | "> Relevant (YES / NO):\u001b[0m\n", 174 | "\n", 175 | "\n", 176 | "\u001b[1m> Entering new LLMChain chain...\u001b[0m\n", 177 | "Prompt after formatting:\n", 178 | "\u001b[32;1m\u001b[1;3mGiven the following question and context, return YES if the context is relevant to the question and NO if it isn't.\n", 179 | "\n", 180 | "> Question: How many seats in the car?\n", 181 | "> Context:\n", 182 | ">>>\n", 183 | "The bad news is that the GTS coupe starts at $166,895. The back seat is no longer\n", 184 | "standard, though adding the +2 second row is a no-cost option. That's mega\n", 185 | "money, but until Porsche sees sales drop, you better believe it will keep raising its\n", 186 | "prices. That's just basic supply-demand, and plenty of people will line up for this\n", 187 | "water-cooled hybrid.\n", 188 | "\n", 189 | ">>>\n", 190 | "> Relevant (YES / NO):\u001b[0m\n" 191 | ] 192 | }, 193 | { 194 | "name": "stderr", 195 | "output_type": "stream", 196 | "text": [ 197 | "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", 198 | "To disable this warning, you can either:\n", 199 | "\t- Avoid using `tokenizers` before the fork if possible\n", 200 | "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", 201 | "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", 202 | "To disable this warning, you can either:\n", 203 | "\t- Avoid using `tokenizers` before the fork if possible\n", 204 | "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", 205 | "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", 206 | "To disable this warning, you can either:\n", 207 | "\t- Avoid using `tokenizers` before the fork if possible\n", 208 | "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", 209 | "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. 
Disabling parallelism to avoid deadlocks...\n", 210 | "To disable this warning, you can either:\n", 211 | "\t- Avoid using `tokenizers` before the fork if possible\n", 212 | "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n", 213 | "INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n", 214 | "INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n", 215 | "INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n" 216 | ] 217 | }, 218 | { 219 | "name": "stdout", 220 | "output_type": "stream", 221 | "text": [ 222 | "\n", 223 | "\u001b[1m> Finished chain.\u001b[0m\n", 224 | "\n", 225 | "\u001b[1m> Finished chain.\u001b[0m\n", 226 | "\n", 227 | "\u001b[1m> Finished chain.\u001b[0m\n" 228 | ] 229 | }, 230 | { 231 | "name": "stderr", 232 | "output_type": "stream", 233 | "text": [ 234 | "INFO:httpx:HTTP Request: POST https://api.groq.com/openai/v1/chat/completions \"HTTP/1.1 200 OK\"\n" 235 | ] 236 | }, 237 | { 238 | "name": "stdout", 239 | "output_type": "stream", 240 | "text": [ 241 | "\n", 242 | "\u001b[1m> Finished chain.\u001b[0m\n" 243 | ] 244 | } 245 | ], 246 | "source": [ 247 | "chain_filter = LLMChainFilter.from_llm(create_llm())\n", 248 | "filtered_documents = chain_filter.compress_documents(documents, \"How many seats in the car?\")" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 7, 254 | "metadata": {}, 255 | "outputs": [ 256 | { 257 | "name": "stdout", 258 | "output_type": "stream", 259 | "text": [ 260 | "1\n", 261 | "The bad news is that the GTS coupe starts at $166,895. The back seat is no longer\n", 262 | "standard, though adding the +2 second row is a no-cost option. That's mega\n", 263 | "money, but until Porsche sees sales drop, you better believe it will keep raising its\n", 264 | "prices. 
That's just basic supply-demand, and plenty of people will line up for this\n", 265 | "water-cooled hybrid.\n", 266 | "\n" 267 | ] 268 | } 269 | ], 270 | "source": [ 271 | "print(len(filtered_documents))\n", 272 | "print(filtered_documents[0].page_content)" 273 | ] 274 | } 275 | ], 276 | "metadata": { 277 | "kernelspec": { 278 | "display_name": "ragbase-YTjELISx-py3.12", 279 | "language": "python", 280 | "name": "python3" 281 | }, 282 | "language_info": { 283 | "codemirror_mode": { 284 | "name": "ipython", 285 | "version": 3 286 | }, 287 | "file_extension": ".py", 288 | "mimetype": "text/x-python", 289 | "name": "python", 290 | "nbconvert_exporter": "python", 291 | "pygments_lexer": "ipython3", 292 | "version": "3.12.4" 293 | } 294 | }, 295 | "nbformat": 4, 296 | "nbformat_minor": 2 297 | } 298 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "ragbase" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["Venelin Valkov "] 6 | license = "MIT" 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = ">=3.12,<3.13" 11 | langchain-community = "^0.2.6" 12 | flashrank = "^0.2.8" 13 | qdrant-client = "^1.10.1" 14 | pypdfium2 = "^4.30.0" 15 | fastembed = "^0.3.3" 16 | langchain-experimental = "^0.0.62" 17 | langchain-qdrant = "^0.1.1" 18 | streamlit = "^1.36.0" 19 | python-dotenv = "^1.0.1" 20 | langchain-groq = "^0.1.6" 21 | 22 | 23 | [tool.poetry.group.dev.dependencies] 24 | ruff = "^0.5.1" 25 | jupyterlab = "^4.2.3" 26 | watchdog = "^4.0.1" 27 | 28 | [build-system] 29 | requires = ["poetry-core"] 30 | build-backend = "poetry.core.masonry.api" 31 | -------------------------------------------------------------------------------- /ragbase/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/curiousily/ragbase/5af7b79162329fa0b725b39d1aeeea7eb1e4428b/ragbase/__init__.py -------------------------------------------------------------------------------- /ragbase/chain.py: -------------------------------------------------------------------------------- 1 | import re 2 | from operator import itemgetter 3 | from typing import List 4 | 5 | from langchain.schema.runnable import RunnablePassthrough 6 | from langchain_core.documents import Document 7 | from langchain_core.language_models import BaseLanguageModel 8 | from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder 9 | from langchain_core.runnables import Runnable 10 | from langchain_core.runnables.history import RunnableWithMessageHistory 11 | from langchain_core.tracers.stdout import ConsoleCallbackHandler 12 | from langchain_core.vectorstores import VectorStoreRetriever 13 | 14 | from ragbase.config import Config 15 | from ragbase.session_history import get_session_history 16 | 17 | SYSTEM_PROMPT = """ 18 | Utilize the provided contextual information to respond to the user question. 19 | If the answer is not found within the context, state that the answer cannot be found. 20 | Prioritize concise responses (maximum of 3 sentences) and use a list where applicable. 21 | The contextual information is organized with the most relevant source appearing first. 22 | Each source is separated by a horizontal rule (---). 23 | 24 | Context: 25 | {context} 26 | 27 | Use markdown formatting where appropriate. 
28 | """ 29 | 30 | 31 | def remove_links(text: str) -> str: 32 | url_pattern = r"https?://\S+|www\.\S+" 33 | return re.sub(url_pattern, "", text) 34 | 35 | 36 | def format_documents(documents: List[Document]) -> str: 37 | texts = [] 38 | for doc in documents: 39 | texts.append(doc.page_content) 40 | texts.append("---") 41 | 42 | return remove_links("\n".join(texts)) 43 | 44 | 45 | def create_chain(llm: BaseLanguageModel, retriever: VectorStoreRetriever) -> Runnable: 46 | prompt = ChatPromptTemplate.from_messages( 47 | [ 48 | ("system", SYSTEM_PROMPT), 49 | MessagesPlaceholder("chat_history"), 50 | ("human", "{question}"), 51 | ] 52 | ) 53 | 54 | chain = ( 55 | RunnablePassthrough.assign( 56 | context=itemgetter("question") 57 | | retriever.with_config({"run_name": "context_retriever"}) 58 | | format_documents 59 | ) 60 | | prompt 61 | | llm 62 | ) 63 | 64 | return RunnableWithMessageHistory( 65 | chain, 66 | get_session_history, 67 | input_messages_key="question", 68 | history_messages_key="chat_history", 69 | ).with_config({"run_name": "chain_answer"}) 70 | 71 | 72 | async def ask_question(chain: Runnable, question: str, session_id: str): 73 | async for event in chain.astream_events( 74 | {"question": question}, 75 | config={ 76 | "callbacks": [ConsoleCallbackHandler()] if Config.DEBUG else [], 77 | "configurable": {"session_id": session_id}, 78 | }, 79 | version="v2", 80 | include_names=["context_retriever", "chain_answer"], 81 | ): 82 | event_type = event["event"] 83 | if event_type == "on_retriever_end": 84 | yield event["data"]["output"] 85 | if event_type == "on_chain_stream": 86 | yield event["data"]["chunk"].content 87 | -------------------------------------------------------------------------------- /ragbase/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | 5 | class Config: 6 | class Path: 7 | APP_HOME = Path(os.getenv("APP_HOME", Path(__file__).parent.parent)) 8 | DATABASE_DIR = APP_HOME / "docs-db" 9 | DOCUMENTS_DIR = APP_HOME / "tmp" 10 | IMAGES_DIR = APP_HOME / "images" 11 | 12 | class Database: 13 | DOCUMENTS_COLLECTION = "documents" 14 | 15 | class Model: 16 | EMBEDDINGS = "BAAI/bge-base-en-v1.5" 17 | RERANKER = "ms-marco-MiniLM-L-12-v2" 18 | LOCAL_LLM = "gemma2:9b" 19 | REMOTE_LLM = "llama-3.1-70b-versatile" 20 | TEMPERATURE = 0.0 21 | MAX_TOKENS = 8000 22 | USE_LOCAL = False 23 | 24 | class Retriever: 25 | USE_RERANKER = True 26 | USE_CHAIN_FILTER = False 27 | 28 | DEBUG = False 29 | CONVERSATION_MESSAGES_LIMIT = 6 30 | -------------------------------------------------------------------------------- /ragbase/ingestor.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import List 3 | 4 | from langchain_community.document_loaders import PyPDFium2Loader 5 | from langchain_community.embeddings.fastembed import FastEmbedEmbeddings 6 | from langchain_core.vectorstores import VectorStore 7 | from langchain_experimental.text_splitter import SemanticChunker 8 | from langchain_qdrant import Qdrant 9 | from langchain_text_splitters import RecursiveCharacterTextSplitter 10 | 11 | from ragbase.config import Config 12 | 13 | 14 | class Ingestor: 15 | def __init__(self): 16 | self.embeddings = FastEmbedEmbeddings(model_name=Config.Model.EMBEDDINGS) 17 | self.semantic_splitter = SemanticChunker( 18 | self.embeddings, breakpoint_threshold_type="interquartile" 19 | ) 20 | self.recursive_splitter = 
RecursiveCharacterTextSplitter( 21 | chunk_size=2048, 22 | chunk_overlap=128, 23 | add_start_index=True, 24 | ) 25 | 26 | def ingest(self, doc_paths: List[Path]) -> VectorStore: 27 | documents = [] 28 | for doc_path in doc_paths: 29 | loaded_documents = PyPDFium2Loader(doc_path).load() 30 | document_text = "\n".join([doc.page_content for doc in loaded_documents]) 31 | documents.extend( 32 | self.recursive_splitter.split_documents( 33 | self.semantic_splitter.create_documents([document_text]) 34 | ) 35 | ) 36 | return Qdrant.from_documents( 37 | documents=documents, 38 | embedding=self.embeddings, 39 | path=Config.Path.DATABASE_DIR, 40 | collection_name=Config.Database.DOCUMENTS_COLLECTION, 41 | ) 42 | -------------------------------------------------------------------------------- /ragbase/model.py: -------------------------------------------------------------------------------- 1 | from langchain_community.chat_models import ChatOllama 2 | from langchain_community.document_compressors.flashrank_rerank import FlashrankRerank 3 | from langchain_community.embeddings.fastembed import FastEmbedEmbeddings 4 | from langchain_core.language_models import BaseLanguageModel 5 | from langchain_groq import ChatGroq 6 | 7 | from ragbase.config import Config 8 | 9 | 10 | def create_llm() -> BaseLanguageModel: 11 | if Config.Model.USE_LOCAL: 12 | return ChatOllama( 13 | model=Config.Model.LOCAL_LLM, 14 | temperature=Config.Model.TEMPERATURE, 15 | keep_alive="1h", 16 | max_tokens=Config.Model.MAX_TOKENS, 17 | ) 18 | else: 19 | return ChatGroq( 20 | temperature=Config.Model.TEMPERATURE, 21 | model_name=Config.Model.REMOTE_LLM, 22 | max_tokens=Config.Model.MAX_TOKENS, 23 | ) 24 | 25 | 26 | def create_embeddings() -> FastEmbedEmbeddings: 27 | return FastEmbedEmbeddings(model_name=Config.Model.EMBEDDINGS) 28 | 29 | 30 | def create_reranker() -> FlashrankRerank: 31 | return FlashrankRerank(model=Config.Model.RERANKER) 32 | -------------------------------------------------------------------------------- /ragbase/retriever.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from langchain.retrievers import ContextualCompressionRetriever 4 | from langchain.retrievers.document_compressors.chain_filter import LLMChainFilter 5 | from langchain_core.language_models import BaseLanguageModel 6 | from langchain_core.vectorstores import VectorStore, VectorStoreRetriever 7 | from langchain_qdrant import Qdrant 8 | 9 | from ragbase.config import Config 10 | from ragbase.model import create_embeddings, create_reranker 11 | 12 | 13 | def create_retriever( 14 | llm: BaseLanguageModel, vector_store: Optional[VectorStore] = None 15 | ) -> VectorStoreRetriever: 16 | if not vector_store: 17 | vector_store = Qdrant.from_existing_collection( 18 | embedding=create_embeddings(), 19 | collection_name=Config.Database.DOCUMENTS_COLLECTION, 20 | path=Config.Path.DATABASE_DIR, 21 | ) 22 | 23 | retriever = vector_store.as_retriever( 24 | search_type="similarity", search_kwargs={"k": 5} 25 | ) 26 | 27 | if Config.Retriever.USE_RERANKER: 28 | retriever = ContextualCompressionRetriever( 29 | base_compressor=create_reranker(), base_retriever=retriever 30 | ) 31 | 32 | if Config.Retriever.USE_CHAIN_FILTER: 33 | retriever = ContextualCompressionRetriever( 34 | base_compressor=LLMChainFilter.from_llm(llm), base_retriever=retriever 35 | ) 36 | 37 | return retriever 38 | -------------------------------------------------------------------------------- 
/ragbase/session_history.py: -------------------------------------------------------------------------------- 1 | from langchain_community.chat_message_histories import ChatMessageHistory 2 | 3 | store = {} 4 | 5 | 6 | def get_session_history(session_id: str) -> ChatMessageHistory: 7 | if session_id not in store: 8 | store[session_id] = ChatMessageHistory() 9 | return store[session_id] 10 | -------------------------------------------------------------------------------- /ragbase/uploader.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from pathlib import Path 3 | from typing import List 4 | 5 | from streamlit.runtime.uploaded_file_manager import UploadedFile 6 | 7 | from ragbase.config import Config 8 | 9 | 10 | def upload_files( 11 | files: List[UploadedFile], remove_old_files: bool = True 12 | ) -> List[Path]: 13 | if remove_old_files: 14 | shutil.rmtree(Config.Path.DATABASE_DIR, ignore_errors=True) 15 | shutil.rmtree(Config.Path.DOCUMENTS_DIR, ignore_errors=True) 16 | Config.Path.DOCUMENTS_DIR.mkdir(parents=True, exist_ok=True) 17 | file_paths = [] 18 | for file in files: 19 | file_path = Config.Path.DOCUMENTS_DIR / file.name 20 | with file_path.open("wb") as f: 21 | f.write(file.getvalue()) 22 | file_paths.append(file_path) 23 | return file_paths 24 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fastembed==0.3.3 2 | flashrank==0.2.8 3 | langchain-community==0.2.6 4 | langchain-experimental==0.0.62 5 | langchain-groq==0.1.6 6 | langchain-qdrant==0.1.1 7 | langchain==0.2.6 8 | python-dotenv==1.0.1 9 | qdrant-client==1.10.1 10 | streamlit==1.36.0 11 | pypdfium2==4.30.0 --------------------------------------------------------------------------------