├── ragnarok ├── img │ └── ragnarok.png ├── requirements.in ├── requirements.txt.orig ├── ollama_test.py ├── .vscode │ └── launch.json ├── pages │ ├── 2_Free_Chat.py │ └── 1_RAGnarok_Chat.py ├── requirements.txt └── RAGnarok_Settings.py ├── launch.sh ├── LICENSE ├── .gitignore └── README.md /ragnarok/img/ragnarok.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GhostPack/RAGnarok/HEAD/ragnarok/img/ragnarok.png -------------------------------------------------------------------------------- /ragnarok/requirements.in: -------------------------------------------------------------------------------- 1 | transformers 2 | sentence-transformers 3 | huggingface-hub 4 | langchain 5 | langchain-community 6 | llama-cpp-python 7 | streamlit 8 | streamlit-extras 9 | streamlit-cookies-manager 10 | ollama -------------------------------------------------------------------------------- /ragnarok/requirements.txt.orig: -------------------------------------------------------------------------------- 1 | transformers==4.37.0 2 | sentence-transformers==2.2.2 3 | huggingface-hub==0.20.3 4 | langchain==0.1.2 5 | langchain-community==0.0.14 6 | llama-cpp-python==0.2.26 7 | streamlit==1.30.0 8 | streamlit-extras==0.3.6 9 | streamlit-cookies-manager==0.2.0 -------------------------------------------------------------------------------- /ragnarok/ollama_test.py: -------------------------------------------------------------------------------- 1 | from ollama import Client 2 | client = Client( 3 | host='http://10.0.0.11:11434', 4 | headers={} 5 | ) 6 | response = client.chat(model='llama3.2', messages=[ 7 | { 8 | 'role': 'user', 9 | 'content': 'Why is the sky blue?', 10 | }, 11 | ]) -------------------------------------------------------------------------------- /launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # create the Python virtual env and install the requirements 4 | cd ragnarok 5 | sudo apt install python3.11-venv 6 | python3 -m venv venv 7 | source venv/bin/activate 8 | pip3 install -r requirements.txt 9 | 10 | # kick off the main app 11 | streamlit run RAGnarok_Settings.py 12 | -------------------------------------------------------------------------------- /ragnarok/.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | 8 | { 9 | "name": "Python: Streamlit", 10 | "type": "debugpy", 11 | "request": "launch", 12 | "module": "streamlit", 13 | "args": [ 14 | "run", 15 | "RAGnarok_Settings.py", 16 | "--server.runOnSave=true" 17 | ], 18 | "jinja": true 19 | } 20 | ] 21 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2024, GhostPack 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. 
Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | 162 | .DS_STORE 163 | 164 | ssh.sh -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RAGnarok 2 | 3 | RAGnarok is a [Retrieval-Augmented Generation](https://blogs.nvidia.com/blog/what-is-retrieval-augmented-generation/) chatbot frontend for [Nemesis](https://github.com/SpecterOps/Nemesis). It allows you to ask questions about text extracted from compatible documents processed by Nemesis. 4 | 5 | ## RAG 6 | 7 | **Short explanation:** The general idea with Retrieval-Augmented Generation (RAG) is to allow a large language model (LLM) to answer questions about documents you've indexed. 8 | 9 | **Medium explanation:** RAG involves processing and turning text inputs into fixed-length vectors via an embedding model, which are then stored in a backend vector database. Questions to the LLM are then used to look up the "most similar" chunks of text, which are then fed into the context prompt for an LLM. 10 | 11 | ![](https://miro.medium.com/v2/resize:fit:1400/format:webp/1*kSkeaXRvRzbJ9SrFZaMoOg.png) 12 | [*Source*](https://towardsdatascience.com/retrieval-augmented-generation-rag-from-theory-to-langchain-implementation-4e9bd5f6a4f2) 13 | 14 | ***Longer explanation in the rest of the section :)*** 15 | 16 | ***Even longer explanation in [this blog post](https://posts.specterops.io/summoning-ragnarok-with-your-nemesis-7c4f0577c93b).*** 17 | 18 | #### Indexing 19 | 20 | Retrieval-augmented generation is an architecture where documents being processed undergo the following steps: 21 | 22 | 1. Plaintext is extracted from any incoming documents.
23 | - Nemesis uses [Apache Tika](https://tika.apache.org/) to extract text from compatible documents. 24 | 2. The text is tokenized into chunks of up to X tokens, where X depends on the *context window* of the embedding model used. 25 | - Nemesis uses Langchain's [TokenTextSplitter](https://api.python.langchain.com/en/latest/text_splitter/langchain.text_splitter.TokenTextSplitter.html), a chunk size of 510 tokens, and a 15% overlap between chunks. 26 | 3. Each chunk of text is processed by an [embedding model](https://huggingface.co/spaces/mteb/leaderboard) which turns the input text into a fixed-length vector of floats. 27 | - As [Pinecone explains](https://www.pinecone.io/learn/vector-embeddings/), what's cool about embedding models is that the vector representations they produce preserve "semantic similarity", meaning that more similar chunks of text will have more similar vectors. 28 | - Nemesis currently uses the [TaylorAI/gte-tiny](https://huggingface.co/TaylorAI/gte-tiny) embedding model as it's fast, but others are possible. 29 | 4. Each vector and associated snippet of text is stored in a vector database. 30 | - Nemesis uses Elasticsearch for vector storage. 31 | 32 | #### Semantic Search 33 | 34 | This is the initial indexing process that Nemesis has been performing for a while. However, in order to complete a RAG pipeline, the next steps are: 35 | 36 | 5. Take an input prompt, such as "*What is a certificate?*", and run it through the same embedding model the files were indexed with. 37 | 6. Query the vector database (e.g., Elasticsearch) for the nearest **k** vectors + associated text chunks that are "closest" to the prompt input vector. 38 | - This will return the **k** chunks of text that are the most similar to the input query. 39 | 7. We also use Elasticsearch's traditional(-ish) BM25 text search over the text for each chunk. 40 | - These two lists of results are combined with [Reciprocal Rank Fusion](https://learn.microsoft.com/en-us/azure/search/hybrid-search-ranking), and the top results from the fused list are returned. 41 | - **Note:** steps 6 and 7 happen in the `nlp` container in Nemesis. This is exposed at http://\/nlp/hybrid_search 42 | 43 | #### Reranking 44 | 45 | We now have the **k** chunks of text most similar to our input query. If we want to get a bit fancier, we can apply what's called [reranking](https://www.pinecone.io/learn/series/rag/rerankers/). 46 | 47 | 8. With reranking, the prompt question and each retrieved text chunk are paired up (question, text) and fed into a more powerful model (well, more powerful than the embedding model) known as a reranker. The reranker generates a similarity score for the input prompt and text chunk. 48 | - RAGnarok uses an adapted version of [BAAI/bge-reranker-base](https://huggingface.co/BAAI/bge-reranker-base) for reranking. 49 | 9. The results are then **reranked** and the top X results are selected. 50 | 51 | #### LLM Processing 52 | 53 | 10. Finally, the resulting texts are combined with a prompt to the (local) LLM. Think something along the lines of "Given these chunks of text {X}, answer this question {Y}". A minimal end-to-end sketch of this pipeline is shown below.
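
To make the flow concrete, here is a rough, self-contained sketch of steps 1-10 in Python. It is an illustration only, not RAGnarok's or Nemesis's actual code: the two sample documents are made up, a plain in-memory cosine-similarity lookup stands in for the Elasticsearch hybrid BM25 + vector search, the stock `BAAI/bge-reranker-base` model is loaded through sentence-transformers' `CrossEncoder` rather than RAGnarok's adapted reranker, and the 76-token chunk overlap is simply 15% of 510. The `TokenTextSplitter` step also assumes the `tiktoken` package is installed.

```python
# Minimal, in-memory sketch of the RAG flow described above (illustrative only).
import numpy as np
from langchain_text_splitters import TokenTextSplitter
from sentence_transformers import SentenceTransformer, CrossEncoder

# made-up sample "documents" standing in for text that Nemesis extracted with Tika
documents = {
    "pki_notes.txt": "A certificate binds a public key to an identity and is signed by a CA...",
    "kerberos.txt": "Kerberos tickets are issued by the KDC and used to authenticate to services...",
}

# steps 1-2: chunk the extracted text (~510 tokens per chunk with ~15% overlap)
splitter = TokenTextSplitter(chunk_size=510, chunk_overlap=76)
chunks = [(name, chunk) for name, text in documents.items() for chunk in splitter.split_text(text)]

# steps 3-4: embed every chunk and keep the vectors in memory (stand-in for the vector database)
embedder = SentenceTransformer("TaylorAI/gte-tiny")
chunk_vectors = embedder.encode([chunk for _, chunk in chunks], normalize_embeddings=True)

# steps 5-7: embed the question and take the k most similar chunks by cosine similarity
# (standing in for the Elasticsearch hybrid vector + BM25 search)
question = "What is a certificate?"
query_vector = embedder.encode([question], normalize_embeddings=True)[0]
k = 3
top_idx = np.argsort(chunk_vectors @ query_vector)[::-1][:k]

# steps 8-9: rerank the candidate (question, chunk) pairs with a cross-encoder reranker
reranker = CrossEncoder("BAAI/bge-reranker-base")
scores = reranker.predict([(question, chunks[i][1]) for i in top_idx])
ranked = sorted(zip(scores.tolist(), top_idx.tolist()), reverse=True)

# step 10: build the final context prompt for the LLM from the top reranked chunks
context = "\n\n".join(f"Source ({chunks[i][0]}, score {score:.2f}):\n{chunks[i][1]}" for score, i in ranked)
llm_prompt = f"Given these chunks of text:\n{context}\n\nAnswer this question: {question}"
print(llm_prompt)
```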
54 | -------------------------------------------------------------------------------- /ragnarok/pages/2_Free_Chat.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | # from transformers import AutoModelForSequenceClassification, AutoTokenizer 3 | from huggingface_hub import hf_hub_download 4 | from llama_cpp import Llama 5 | from streamlit_cookies_manager import CookieManager 6 | import time 7 | import torch 8 | import asyncio 9 | from ollama import AsyncClient 10 | 11 | 12 | cookies = CookieManager() 13 | while not cookies.ready(): 14 | time.sleep(1) 15 | 16 | if "mode" not in cookies: 17 | st.error("Your cookies are broken, please go back to the main settings page.") 18 | st.stop() 19 | 20 | if "mode_index" not in st.session_state: 21 | st.session_state["mode_index"] = 0 # Default to the first mode 22 | 23 | def get_mode_index(cookies): 24 | if "mode" in cookies: 25 | if cookies["mode"] == "Local LLM": 26 | return 0 27 | elif cookies["mode"] == "Remote Ollama Server (Local Reranker)": 28 | return 1 29 | elif cookies["mode"] == "Remote Ollama Server (No Reranker)": 30 | return 2 31 | return 0 # Default to 0 if no mode is set 32 | 33 | @st.cache_resource 34 | def get_ollama_client(ollama_url): 35 | client = AsyncClient(ollama_url) 36 | return client 37 | 38 | mode_index = get_mode_index(cookies) 39 | 40 | if mode_index == 0: 41 | if "llm_model" not in cookies and "ollama_model" not in cookies: 42 | st.error("Please select a LLM model on the main settings page.") 43 | st.stop() 44 | if "llm_temperature" not in cookies: 45 | st.error("Please select a LLM model temperature on the main settings page.") 46 | st.stop() 47 | 48 | if mode_index == 1 or mode_index == 2: 49 | if "ollama_url" not in cookies: 50 | st.error("Please select an Ollama server on the main settings page.") 51 | st.stop() 52 | if "ollama_model" not in cookies: 53 | st.error("Please select an Ollama model on the main settings page.") 54 | st.stop() 55 | 56 | if mode_index == 1 or mode_index == 2: 57 | if "ollama_url" not in cookies: 58 | st.error("Please select an Ollama server on the main settings page.") 59 | st.stop() 60 | if "ollama_model" not in cookies: 61 | st.error("Please select an Ollama model on the main settings page.") 62 | st.stop() 63 | if cookies["ollama_model"] == "": 64 | st.error("Please select an Ollama model on the main settings page.") 65 | st.stop() 66 | 67 | 68 | if mode_index == 0: 69 | llm_generation_kwargs = { 70 | "max_tokens": 512, 71 | "stream": True, 72 | "temperature": float(cookies["llm_temperature"]), 73 | "echo": False 74 | } 75 | if mode_index == 0 or mode_index == 1: 76 | # check for GPU presence 77 | if torch.cuda.is_available(): 78 | # traditional Nvidia cuda GPUs 79 | device = torch.device("cuda:0") 80 | n_gpu_layers = int(cookies["n_gpu_layers"]) 81 | elif torch.backends.mps.is_available(): 82 | # for macOS M1/M2s 83 | device = torch.device("mps") 84 | n_gpu_layers = int(cookies["n_gpu_layers"]) 85 | else: 86 | device = torch.device("cpu") 87 | n_gpu_layers = 0 88 | 89 | @st.cache_resource 90 | def get_llm(llm_model_path, n_gpu_layers): 91 | llm = Llama( 92 | model_path=llm_model_path, 93 | n_ctx=8192, 94 | n_gpu_layers=n_gpu_layers, 95 | verbose=False 96 | ) 97 | return llm 98 | if mode_index == 0: 99 | try: 100 | if cookies["llm_model"] == "Intel/neural-chat-7b-v3-3": 101 | llm_model_path = hf_hub_download("TheBloke/neural-chat-7B-v3-3-GGUF", filename="neural-chat-7b-v3-3.Q5_K_M.gguf", local_files_only=True) 102 | elif 
cookies["llm_model"] == "openchat-3.5-0106": 103 | llm_model_path = hf_hub_download("TheBloke/openchat-3.5-0106-GGUF", filename="openchat-3.5-0106.Q5_K_M.gguf", local_files_only=True) 104 | elif cookies["llm_model"] == "Starling-LM-7B-alpha": 105 | llm_model_path = hf_hub_download("TheBloke/Starling-LM-7B-alpha-GGUF", filename="starling-lm-7b-alpha.Q5_K_M.gguf", local_files_only=True) 106 | else: 107 | llm_model = cookies["llm_model"] 108 | st.error(f"Invalid llm_model: {llm_model}") 109 | except: 110 | with st.spinner("Downloading LLM model (this will take some time)..."): 111 | if cookies["llm_model"] == "Intel/neural-chat-7b-v3-3": 112 | llm_model_path = hf_hub_download("TheBloke/neural-chat-7B-v3-3-GGUF", filename="neural-chat-7b-v3-3.Q5_K_M.gguf") 113 | elif cookies["llm_model"] == "openchat-3.5-0106": 114 | llm_model_path = hf_hub_download("TheBloke/openchat-3.5-0106-GGUF", filename="openchat-3.5-0106.Q5_K_M.gguf") 115 | elif cookies["llm_model"] == "Starling-LM-7B-alpha": 116 | llm_model_path = hf_hub_download("TheBloke/Starling-LM-7B-alpha-GGUF", filename="starling-lm-7b-alpha.Q5_K_M.gguf") 117 | else: 118 | llm_model = cookies["llm_model"] 119 | st.error(f"Invalid llm_model_path: {llm_model}") 120 | 121 | llm = get_llm(llm_model_path, n_gpu_layers) 122 | 123 | st.title("Free Chat With Selected Model") 124 | st.warning('*WARNING: results not guaranteed to be correct!*', icon="⚠️") 125 | 126 | if "freeform_messages" not in st.session_state: 127 | st.session_state.freeform_messages = [] 128 | 129 | for message in st.session_state.freeform_messages: 130 | with st.chat_message(message["role"]): 131 | st.markdown(message["content"]) 132 | 133 | if prompt := st.chat_input(""): 134 | st.session_state.freeform_messages.append({"role": "user", "content": prompt}) 135 | with st.chat_message("user"): 136 | st.markdown(prompt) 137 | 138 | with st.chat_message("assistant"): 139 | message_placeholder = st.empty() 140 | full_response = "" 141 | 142 | if "neural-chat" in cookies["llm_model"]: 143 | single_turn_prompt = f"### System:\nYou are a helpful assistant chatbot.\n### User:\n{prompt}\n### Assistant:\n" 144 | else: 145 | single_turn_prompt = f"GPT4 Correct User: {prompt}<|end_of_turn|>GPT4 Correct Assistant:" 146 | 147 | with st.spinner("LLM is processing the prompt..."): 148 | start = time.time() 149 | if mode_index == 0: 150 | stream = llm.create_completion(single_turn_prompt, **llm_generation_kwargs) 151 | for output in stream: 152 | full_response += (output['choices'][0]['text'] or "").split("### Assistant:\n")[-1] 153 | message_placeholder.markdown(full_response + "▌") 154 | elif mode_index == 1 or mode_index == 2: 155 | client = get_ollama_client(cookies["ollama_url"]) 156 | async def chat(): 157 | full_response = "" 158 | # TODO: remove hardcoded model 159 | message = {'role': 'user', 'content': single_turn_prompt} 160 | async for part in await AsyncClient().chat(model='dolphin3:8b', messages=[message], stream=True): 161 | print(part['message']['content'], end='', flush=True) 162 | full_response += part['message']['content'] 163 | message_placeholder.markdown(full_response + "▌") 164 | return full_response 165 | 166 | full_response = asyncio.run(chat()) 167 | 168 | 169 | end = time.time() 170 | print(f"LLM generation completed in {(end - start):.2f} seconds") 171 | 172 | st.session_state.freeform_messages.append({"role": "assistant", "content": f"{full_response}"}) 173 | -------------------------------------------------------------------------------- /ragnarok/requirements.txt: 
-------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.13 3 | # by the following command: 4 | # 5 | # pip-compile 6 | # 7 | aiohappyeyeballs==2.6.1 8 | # via aiohttp 9 | aiohttp==3.12.14 10 | # via langchain-community 11 | aiosignal==1.4.0 12 | # via aiohttp 13 | altair==5.5.0 14 | # via 15 | # altex 16 | # streamlit 17 | altex==0.2.0 18 | # via 19 | # streamlit-extras 20 | # streamlit-faker 21 | annotated-types==0.7.0 22 | # via pydantic 23 | anyio==4.9.0 24 | # via httpx 25 | attrs==25.3.0 26 | # via 27 | # aiohttp 28 | # jsonschema 29 | # referencing 30 | beautifulsoup4==4.13.4 31 | # via favicon 32 | blinker==1.9.0 33 | # via streamlit 34 | cachetools==6.1.0 35 | # via streamlit 36 | certifi==2025.7.14 37 | # via 38 | # httpcore 39 | # httpx 40 | # requests 41 | cffi==1.17.1 42 | # via cryptography 43 | charset-normalizer==3.4.2 44 | # via requests 45 | click==8.2.1 46 | # via streamlit 47 | contourpy==1.3.2 48 | # via matplotlib 49 | cryptography==45.0.5 50 | # via streamlit-cookies-manager 51 | cycler==0.12.1 52 | # via matplotlib 53 | dataclasses-json==0.6.7 54 | # via langchain-community 55 | diskcache==5.6.3 56 | # via llama-cpp-python 57 | entrypoints==0.4 58 | # via streamlit-extras 59 | faker==37.4.2 60 | # via streamlit-faker 61 | favicon==0.7.0 62 | # via markdownlit 63 | filelock==3.18.0 64 | # via 65 | # huggingface-hub 66 | # torch 67 | # transformers 68 | fonttools==4.59.0 69 | # via matplotlib 70 | frozenlist==1.7.0 71 | # via 72 | # aiohttp 73 | # aiosignal 74 | fsspec==2025.7.0 75 | # via 76 | # huggingface-hub 77 | # torch 78 | gitdb==4.0.12 79 | # via gitpython 80 | gitpython==3.1.44 81 | # via streamlit 82 | greenlet==3.2.3 83 | # via sqlalchemy 84 | h11==0.16.0 85 | # via httpcore 86 | hf-xet==1.1.5 87 | # via huggingface-hub 88 | htbuilder==0.9.0 89 | # via 90 | # markdownlit 91 | # st-annotated-text 92 | # streamlit-extras 93 | httpcore==1.0.9 94 | # via httpx 95 | httpx==0.28.1 96 | # via 97 | # langsmith 98 | # ollama 99 | httpx-sse==0.4.1 100 | # via langchain-community 101 | huggingface-hub==0.33.4 102 | # via 103 | # -r requirements.in 104 | # sentence-transformers 105 | # tokenizers 106 | # transformers 107 | idna==3.10 108 | # via 109 | # anyio 110 | # httpx 111 | # requests 112 | # yarl 113 | jinja2==3.1.6 114 | # via 115 | # altair 116 | # llama-cpp-python 117 | # pydeck 118 | # streamlit-camera-input-live 119 | # streamlit-image-coordinates 120 | # streamlit-keyup 121 | # torch 122 | joblib==1.5.1 123 | # via scikit-learn 124 | jsonpatch==1.33 125 | # via langchain-core 126 | jsonpointer==3.0.0 127 | # via jsonpatch 128 | jsonschema==4.25.0 129 | # via altair 130 | jsonschema-specifications==2025.4.1 131 | # via jsonschema 132 | kiwisolver==1.4.8 133 | # via matplotlib 134 | langchain==0.3.26 135 | # via 136 | # -r requirements.in 137 | # langchain-community 138 | langchain-community==0.3.27 139 | # via -r requirements.in 140 | langchain-core==0.3.71 141 | # via 142 | # langchain 143 | # langchain-community 144 | # langchain-text-splitters 145 | langchain-text-splitters==0.3.8 146 | # via langchain 147 | langsmith==0.4.8 148 | # via 149 | # langchain 150 | # langchain-community 151 | # langchain-core 152 | llama-cpp-python==0.3.14 153 | # via -r requirements.in 154 | lxml==6.0.0 155 | # via markdownlit 156 | markdown==3.8.2 157 | # via 158 | # markdownlit 159 | # pymdown-extensions 160 | markdownlit==0.0.7 161 | # via streamlit-extras 162 | markupsafe==3.0.2 
163 | # via jinja2 164 | marshmallow==3.26.1 165 | # via dataclasses-json 166 | matplotlib==3.10.3 167 | # via streamlit-faker 168 | mpmath==1.3.0 169 | # via sympy 170 | multidict==6.6.3 171 | # via 172 | # aiohttp 173 | # yarl 174 | mypy-extensions==1.1.0 175 | # via typing-inspect 176 | narwhals==1.48.0 177 | # via 178 | # altair 179 | # plotly 180 | networkx==3.5 181 | # via torch 182 | numpy==2.3.1 183 | # via 184 | # contourpy 185 | # langchain-community 186 | # llama-cpp-python 187 | # matplotlib 188 | # pandas 189 | # pydeck 190 | # scikit-learn 191 | # scipy 192 | # streamlit 193 | # transformers 194 | nvidia-cublas-cu12==12.6.4.1 195 | # via 196 | # nvidia-cudnn-cu12 197 | # nvidia-cusolver-cu12 198 | # torch 199 | nvidia-cuda-cupti-cu12==12.6.80 200 | # via torch 201 | nvidia-cuda-nvrtc-cu12==12.6.77 202 | # via torch 203 | nvidia-cuda-runtime-cu12==12.6.77 204 | # via torch 205 | nvidia-cudnn-cu12==9.5.1.17 206 | # via torch 207 | nvidia-cufft-cu12==11.3.0.4 208 | # via torch 209 | nvidia-cufile-cu12==1.11.1.6 210 | # via torch 211 | nvidia-curand-cu12==10.3.7.77 212 | # via torch 213 | nvidia-cusolver-cu12==11.7.1.2 214 | # via torch 215 | nvidia-cusparse-cu12==12.5.4.2 216 | # via 217 | # nvidia-cusolver-cu12 218 | # torch 219 | nvidia-cusparselt-cu12==0.6.3 220 | # via torch 221 | nvidia-nccl-cu12==2.26.2 222 | # via torch 223 | nvidia-nvjitlink-cu12==12.6.85 224 | # via 225 | # nvidia-cufft-cu12 226 | # nvidia-cusolver-cu12 227 | # nvidia-cusparse-cu12 228 | # torch 229 | nvidia-nvtx-cu12==12.6.77 230 | # via torch 231 | ollama==0.5.1 232 | # via -r requirements.in 233 | orjson==3.11.0 234 | # via langsmith 235 | packaging==25.0 236 | # via 237 | # altair 238 | # huggingface-hub 239 | # langchain-core 240 | # langsmith 241 | # marshmallow 242 | # matplotlib 243 | # plotly 244 | # streamlit 245 | # transformers 246 | pandas==2.3.1 247 | # via 248 | # altex 249 | # streamlit 250 | pillow==11.3.0 251 | # via 252 | # matplotlib 253 | # sentence-transformers 254 | # streamlit 255 | plotly==6.2.0 256 | # via streamlit-extras 257 | prometheus-client==0.22.1 258 | # via streamlit-extras 259 | propcache==0.3.2 260 | # via 261 | # aiohttp 262 | # yarl 263 | protobuf==6.31.1 264 | # via 265 | # streamlit 266 | # streamlit-extras 267 | pyarrow==21.0.0 268 | # via streamlit 269 | pycparser==2.22 270 | # via cffi 271 | pydantic==2.11.7 272 | # via 273 | # langchain 274 | # langchain-core 275 | # langsmith 276 | # ollama 277 | # pydantic-settings 278 | pydantic-core==2.33.2 279 | # via pydantic 280 | pydantic-settings==2.10.1 281 | # via langchain-community 282 | pydeck==0.9.1 283 | # via streamlit 284 | pymdown-extensions==10.16 285 | # via markdownlit 286 | pyparsing==3.2.3 287 | # via matplotlib 288 | python-dateutil==2.9.0.post0 289 | # via 290 | # matplotlib 291 | # pandas 292 | python-dotenv==1.1.1 293 | # via pydantic-settings 294 | pytz==2025.2 295 | # via pandas 296 | pyyaml==6.0.2 297 | # via 298 | # huggingface-hub 299 | # langchain 300 | # langchain-community 301 | # langchain-core 302 | # pymdown-extensions 303 | # transformers 304 | referencing==0.36.2 305 | # via 306 | # jsonschema 307 | # jsonschema-specifications 308 | regex==2024.11.6 309 | # via transformers 310 | requests==2.32.4 311 | # via 312 | # favicon 313 | # huggingface-hub 314 | # langchain 315 | # langchain-community 316 | # langsmith 317 | # requests-toolbelt 318 | # streamlit 319 | # transformers 320 | requests-toolbelt==1.0.0 321 | # via langsmith 322 | rpds-py==0.26.0 323 | # via 324 | # jsonschema 325 | # 
referencing 326 | safetensors==0.5.3 327 | # via transformers 328 | scikit-learn==1.7.1 329 | # via sentence-transformers 330 | scipy==1.16.0 331 | # via 332 | # scikit-learn 333 | # sentence-transformers 334 | sentence-transformers==5.0.0 335 | # via -r requirements.in 336 | six==1.17.0 337 | # via python-dateutil 338 | smmap==5.0.2 339 | # via gitdb 340 | sniffio==1.3.1 341 | # via anyio 342 | soupsieve==2.7 343 | # via beautifulsoup4 344 | sqlalchemy==2.0.41 345 | # via 346 | # langchain 347 | # langchain-community 348 | st-annotated-text==4.0.2 349 | # via streamlit-extras 350 | st-theme==1.2.3 351 | # via streamlit-extras 352 | streamlit==1.47.0 353 | # via 354 | # -r requirements.in 355 | # altex 356 | # markdownlit 357 | # st-theme 358 | # streamlit-avatar 359 | # streamlit-camera-input-live 360 | # streamlit-card 361 | # streamlit-cookies-manager 362 | # streamlit-embedcode 363 | # streamlit-extras 364 | # streamlit-faker 365 | # streamlit-image-coordinates 366 | # streamlit-keyup 367 | # streamlit-toggle-switch 368 | # streamlit-vertical-slider 369 | streamlit-avatar==0.1.3 370 | # via streamlit-extras 371 | streamlit-camera-input-live==0.2.0 372 | # via streamlit-extras 373 | streamlit-card==1.0.2 374 | # via streamlit-extras 375 | streamlit-cookies-manager==0.2.0 376 | # via -r requirements.in 377 | streamlit-embedcode==0.1.2 378 | # via streamlit-extras 379 | streamlit-extras==0.6.0 380 | # via 381 | # -r requirements.in 382 | # markdownlit 383 | streamlit-faker==0.0.4 384 | # via streamlit-extras 385 | streamlit-image-coordinates==0.1.9 386 | # via streamlit-extras 387 | streamlit-keyup==0.3.0 388 | # via streamlit-extras 389 | streamlit-toggle-switch==1.0.2 390 | # via streamlit-extras 391 | streamlit-vertical-slider==2.5.5 392 | # via streamlit-extras 393 | sympy==1.14.0 394 | # via torch 395 | tenacity==9.1.2 396 | # via 397 | # langchain-community 398 | # langchain-core 399 | # streamlit 400 | threadpoolctl==3.6.0 401 | # via scikit-learn 402 | tokenizers==0.21.2 403 | # via transformers 404 | toml==0.10.2 405 | # via streamlit 406 | torch==2.7.1 407 | # via sentence-transformers 408 | tornado==6.5.1 409 | # via streamlit 410 | tqdm==4.67.1 411 | # via 412 | # huggingface-hub 413 | # sentence-transformers 414 | # transformers 415 | transformers==4.53.3 416 | # via 417 | # -r requirements.in 418 | # sentence-transformers 419 | triton==3.3.1 420 | # via torch 421 | typing-extensions==4.14.1 422 | # via 423 | # altair 424 | # beautifulsoup4 425 | # huggingface-hub 426 | # langchain-core 427 | # llama-cpp-python 428 | # pydantic 429 | # pydantic-core 430 | # sentence-transformers 431 | # sqlalchemy 432 | # streamlit 433 | # torch 434 | # typing-inspect 435 | # typing-inspection 436 | typing-inspect==0.9.0 437 | # via dataclasses-json 438 | typing-inspection==0.4.1 439 | # via 440 | # pydantic 441 | # pydantic-settings 442 | tzdata==2025.2 443 | # via 444 | # faker 445 | # pandas 446 | urllib3==2.5.0 447 | # via requests 448 | validators==0.35.0 449 | # via streamlit-extras 450 | watchdog==6.0.0 451 | # via streamlit 452 | yarl==1.20.1 453 | # via aiohttp 454 | zstandard==0.23.0 455 | # via langsmith 456 | 457 | # The following packages are considered to be unsafe in a requirements file: 458 | # setuptools 459 | -------------------------------------------------------------------------------- /ragnarok/RAGnarok_Settings.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from requests.auth import HTTPBasicAuth 3 | import 
time 4 | # from langchain_community.embeddings import HuggingFaceEmbeddings 5 | # from transformers import AutoModelForSequenceClassification, AutoTokenizer 6 | import streamlit as st 7 | from streamlit_cookies_manager import CookieManager 8 | import torch 9 | import urllib3 10 | import urllib 11 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) 12 | torch.classes.__path__ = [] 13 | 14 | cookies = CookieManager() 15 | while not cookies.ready(): 16 | time.sleep(1) 17 | 18 | if torch.cuda.is_available(): 19 | device = "cuda" 20 | elif torch.backends.mps.is_available(): 21 | device = "mps" 22 | else: 23 | device = "cpu" 24 | 25 | columns = st.columns(2) 26 | with columns[0]: 27 | st.image("./img/ragnarok.png") 28 | st.markdown("#### *[RAGnarok](https://github.com/GhostPack/RAGnarok) is a Retrieval-Augmented Generation LLM ChatBot powered by a Nemesis instance.*") 29 | st.info("**NOTE**: RAGnarok does not have conversational memory, each question is independent.") 30 | st.divider() 31 | 32 | 33 | def wait_for_nemesis(nemesis_url, nemesis_user, nemesis_password, wait_timeout = 3): 34 | retries = 3 35 | success = False 36 | while retries > 0: 37 | try: 38 | if nemesis_url.startswith("https://"): 39 | result = requests.get( 40 | f"{nemesis_url}nlp/ready", 41 | auth=HTTPBasicAuth(nemesis_user, nemesis_password), 42 | timeout=wait_timeout, 43 | verify=False 44 | ) 45 | else: 46 | result = requests.get( 47 | f"{nemesis_url}nlp/ready", 48 | auth=HTTPBasicAuth(nemesis_user, nemesis_password), 49 | timeout=wait_timeout 50 | ) 51 | if result.status_code == 401: 52 | st.error(f"Invalid Nemesis credentials!") 53 | elif result.status_code != 200: 54 | st.warning(f"Error connecting to Nemesis instance {nemesis_url}: '{result.status_code}'") 55 | else: 56 | success = True 57 | retries = 0 58 | st.success("Successfully connected to Nemesis instance!") 59 | break 60 | except Exception as e: 61 | st.warning(f"Encountered an exception while trying to connect to Nemesis instance {nemesis_url}: '{e}', trying again in {wait_timeout} seconds...") 62 | time.sleep(wait_timeout) 63 | retries = retries - 1 64 | continue 65 | if not success: 66 | st.error(f"Error connecting to {nemesis_url}, please check your connection/credentials!") 67 | 68 | default_value = cookies["nemesis_url"] if "nemesis_url" in cookies else "" 69 | nemesis_url = st.text_input( 70 | label="Nemesis URL", 71 | help="The Nemesis endpoint", 72 | value=default_value 73 | ) 74 | cols = st.columns(2) 75 | with cols[0]: 76 | default_value = cookies["nemesis_username"] if "nemesis_username" in cookies else "" 77 | nemesis_username = st.text_input( 78 | label="Nemesis Username", 79 | help="This is the `basic_auth_user` value in nemesis.config.", 80 | value=default_value 81 | ) 82 | with cols[1]: 83 | default_value = cookies["nemesis_password"] if "nemesis_password" in cookies else "" 84 | nemesis_password = st.text_input( 85 | label="Nemesis Password", 86 | help="This is the `basic_auth_password` value in nemesis.config.", 87 | value=default_value 88 | ) 89 | if nemesis_url and nemesis_username and nemesis_password: 90 | if not nemesis_url.endswith("/"): 91 | nemesis_url = f"{nemesis_url}/" 92 | wait_for_nemesis(nemesis_url, nemesis_username, nemesis_password) 93 | cookies["nemesis_url"] = nemesis_url 94 | cookies["nemesis_username"] = nemesis_username 95 | cookies["nemesis_password"] = nemesis_password 96 | st.divider() 97 | 98 | if "mode" not in cookies: 99 | cookies["mode"] = "Local LLM" # Default mode 100 | 101 | if "mode_index" not 
in st.session_state: 102 | st.session_state["mode_index"] = 0 # Default to the first mode 103 | def get_mode_index(cookies): 104 | if "mode" in cookies: 105 | if cookies["mode"] == "Local LLM": 106 | return 0 107 | elif cookies["mode"] == "Remote Ollama Server (Local Reranker)": 108 | return 1 109 | elif cookies["mode"] == "Remote Ollama Server (No Reranker)": 110 | return 2 111 | return 0 # Default to 0 if no mode is set 112 | 113 | def on_change_mode_selectbox(): 114 | # Update cookies and session state 115 | cookies["mode"] = st.session_state["selected_mode"] 116 | st.session_state["mode_index"] = get_mode_index(cookies) 117 | print(f"Mode changed to: {cookies['mode']} (index: {st.session_state['mode_index']})") 118 | 119 | # Selectbox for mode selection 120 | mode = st.selectbox( 121 | label='Ragnarok Mode', 122 | options=('Local LLM', 'Remote Ollama Server (Local Reranker)', 'Remote Ollama Server (No Reranker)'), 123 | help="The mode you want to run RAGnarok in.", 124 | index=st.session_state["mode_index"], # Use session state for the index 125 | on_change=on_change_mode_selectbox, 126 | key="selected_mode" # Bind to session state 127 | ) 128 | # Update mode_index based on session state 129 | mode_index = st.session_state["mode_index"] 130 | 131 | if mode_index == 0: 132 | cols = st.columns(2) 133 | with cols[0]: 134 | default_index = 0 135 | if "llm_model" in cookies: 136 | if "neural-chat" in cookies["llm_model"].lower(): 137 | default_index = 0 138 | elif "openchat" in cookies["llm_model"].lower(): 139 | default_index = 1 140 | elif "starling" in cookies["llm_model"].lower(): 141 | default_index = 2 142 | llm_model = st.selectbox( 143 | label='LLM model to use', 144 | options=('Intel/neural-chat-7b-v3-3', 'openchat-3.5-0106', 'Starling-LM-7B-alpha'), 145 | help="The core LLM to use for chat over retrieved document snippets.", 146 | index=default_index 147 | ) 148 | cookies["llm_model"] = llm_model 149 | with cols[1]: 150 | llm_temperature_default_value = float(cookies["llm_temperature"]) if "llm_temperature" in cookies else 0.1 151 | llm_temperature = st.slider( 152 | label="LLM Temperature", 153 | min_value=0.0, 154 | max_value=1.0, 155 | value=llm_temperature_default_value, 156 | help="The temperate for the core LLM. Higher means more 'creative', lower means more repeatable." 157 | ) 158 | cookies["llm_temperature"] = llm_temperature 159 | cols = st.columns(2) 160 | with cols[0]: 161 | default_index = 0 162 | if "reranking_model" in cookies: 163 | if cookies["reranking_model"] == "Harmj0y/nemesis-reranker": 164 | default_index = 0 165 | elif cookies["reranking_model"] == "BAAI/bge-reranker-base": 166 | default_index = 1 167 | reranking_model = st.selectbox( 168 | label='Reranking model to use', 169 | options=('nemesis-reranker', 'bge-reranker-base',), 170 | help="Model to use to rerank results before sending to the LLM.", 171 | index=default_index 172 | ) 173 | if reranking_model == "nemesis-reranker": 174 | reranking_model = "Harmj0y/nemesis-reranker" 175 | elif reranking_model == "bge-reranker-base": 176 | reranking_model = "BAAI/bge-reranker-base" 177 | cookies["reranking_model"] = reranking_model 178 | with cols[1]: 179 | if device == "cuda" or device == "mps": 180 | default_n_gpu_layers = int(cookies["n_gpu_layers"]) if "n_gpu_layers" in cookies else 4 181 | n_gpu_layers = st.slider( 182 | label="Number of GPU layers to offload to the GPU", 183 | min_value=1, 184 | max_value=32, 185 | value=default_n_gpu_layers, 186 | help="Number of GPU layers to offload to the GPU. 
More _usually_ means faster generation, but may cause out-of-memory errors." 187 | ) 188 | cookies["n_gpu_layers"] = n_gpu_layers 189 | 190 | elif mode_index == 1: 191 | cols = st.columns(1) 192 | with cols[0]: 193 | ollama_url_default_value = cookies["ollama_url"] if "ollama_url" in cookies else "http://localhost:11434" 194 | ollama_url = st.text_input( 195 | label="Ollama Server URL", 196 | help="Ollama Server URL (e.g. http://localhost:11434)", 197 | value=ollama_url_default_value 198 | ) 199 | cookies["ollama_url"] = ollama_url 200 | 201 | cols = st.columns(1) 202 | with cols[0]: 203 | ollama_model_default_value = cookies["ollama_model"] if "ollama_model" in cookies else "" 204 | ollama_model = st.text_input( 205 | label="Ollama Model", 206 | help="The LLM you will use from Ollama (e.g. llama3.3:70b)", 207 | value=ollama_model_default_value 208 | ) 209 | cookies["ollama_model"] = ollama_model 210 | 211 | 212 | cols = st.columns(2) 213 | with cols[0]: 214 | default_index = 0 215 | if "reranking_model" in cookies: 216 | if cookies["reranking_model"] == "Harmj0y/nemesis-reranker": 217 | default_index = 0 218 | elif cookies["reranking_model"] == "BAAI/bge-reranker-base": 219 | default_index = 1 220 | reranking_model = st.selectbox( 221 | label='Reranking model to use', 222 | options=('nemesis-reranker', 'bge-reranker-base',), 223 | help="Model to use to rerank results before sending to the LLM.", 224 | index=default_index 225 | ) 226 | if reranking_model == "nemesis-reranker": 227 | reranking_model = "Harmj0y/nemesis-reranker" 228 | elif reranking_model == "bge-reranker-base": 229 | reranking_model = "BAAI/bge-reranker-base" 230 | cookies["reranking_model"] = reranking_model 231 | with cols[1]: 232 | if device == "cuda" or device == "mps": 233 | default_n_gpu_layers = int(cookies["n_gpu_layers"]) if "n_gpu_layers" in cookies else 4 234 | n_gpu_layers = st.slider( 235 | label="Number of GPU layers to offload to the GPU", 236 | min_value=1, 237 | max_value=32, 238 | value=default_n_gpu_layers, 239 | help="Number of GPU layers to offload to the GPU. More _usually_ means faster generation, but may cause out-of-memory errors." 240 | ) 241 | cookies["n_gpu_layers"] = n_gpu_layers 242 | 243 | elif mode_index == 2: 244 | cols = st.columns(1) 245 | with cols[0]: 246 | ollama_url_default_value = cookies["ollama_url"] if "ollama_url" in cookies else "http://localhost:11434" 247 | ollama_url = st.text_input( 248 | label="Ollama Server URL", 249 | help="Ollama Server URL (e.g. http://localhost:11434)", 250 | value=ollama_url_default_value 251 | ) 252 | cookies["ollama_url"] = ollama_url 253 | cols = st.columns(1) 254 | with cols[0]: 255 | ollama_model_default_value = cookies["ollama_model"] if "ollama_model" in cookies else "" 256 | ollama_model = st.text_input( 257 | label="Ollama Model", 258 | help="The LLM you will use from Ollama (e.g. llama3.3:70b)", 259 | value=ollama_model_default_value 260 | ) 261 | cookies["ollama_model"] = ollama_model 262 | 263 | 264 | cols = st.columns(1) 265 | with cols[0]: 266 | k_similarity_default_value = int(cookies["k_similarity"]) if "k_similarity" in cookies else 30 267 | k_similarity = st.slider( 268 | label="Initial K search", 269 | min_value=1, 270 | max_value=100, 271 | value=k_similarity_default_value, 272 | help="The number of similar indexed documents to pull from the Nemesis backend before performing reranking. More documents casts a wide net but takes more time." 
273 | ) 274 | cookies["k_similarity"] = k_similarity 275 | 276 | cols = st.columns(2) 277 | with cols[0]: 278 | min_doc_results_default_value = int(cookies["min_doc_results"]) if "min_doc_results" in cookies else 1 279 | min_doc_results = st.slider( 280 | label="Minimum number of documents to supply to the LLM", 281 | min_value=1, 282 | max_value=15, 283 | value=min_doc_results_default_value, 284 | help="The minimum number of document results to feed to the LLM's context. More means slower response generation, higher provides more context (but possibly more irrelevant information)." 285 | ) 286 | cookies["min_doc_results"] = min_doc_results 287 | with cols[1]: 288 | max_doc_results_default_value = int(cookies["max_doc_results"]) if "max_doc_results" in cookies else 5 289 | max_doc_results = st.slider( 290 | label="Maximum number of documents to supply to the LLM", 291 | min_value=1, 292 | max_value=15, 293 | value=max_doc_results_default_value, 294 | help="The maximum number of document results to feed to the LLM's context. More means slower response generation, higher provides more context (but possibly more irrelevant information)." 295 | ) 296 | cookies["max_doc_results"] = max_doc_results 297 | 298 | st.divider() 299 | 300 | if st.button("Clear Cookies?"): 301 | cookies.clear() 302 | 303 | st.divider() 304 | 305 | cookies.save() -------------------------------------------------------------------------------- /ragnarok/pages/1_RAGnarok_Chat.py: -------------------------------------------------------------------------------- 1 | import time 2 | import ntpath 3 | import os.path 4 | import requests 5 | import json 6 | from requests.auth import HTTPBasicAuth 7 | 8 | from huggingface_hub import hf_hub_download 9 | from transformers import AutoModelForSequenceClassification, AutoTokenizer 10 | from llama_cpp import Llama 11 | import numpy as np 12 | import streamlit as st 13 | from streamlit_cookies_manager import CookieManager 14 | import torch 15 | import asyncio 16 | from ollama import AsyncClient 17 | 18 | 19 | 20 | cookies = CookieManager() 21 | while not cookies.ready(): 22 | time.sleep(1) 23 | 24 | if "mode" not in cookies: 25 | st.error("Your cookies are broken, please go back to the main settings page.") 26 | st.stop() 27 | 28 | if "mode_index" not in st.session_state: 29 | st.session_state["mode_index"] = 0 # Default to the first mode 30 | def get_mode_index(cookies): 31 | if "mode" in cookies: 32 | if cookies["mode"] == "Local LLM": 33 | return 0 34 | elif cookies["mode"] == "Remote Ollama Server (Local Reranker)": 35 | return 1 36 | elif cookies["mode"] == "Remote Ollama Server (No Reranker)": 37 | return 2 38 | return 0 # Default to 0 if no mode is set 39 | 40 | mode_index = get_mode_index(cookies) 41 | 42 | if mode_index == 0: 43 | if "llm_model" not in cookies and "ollama_model" not in cookies: 44 | st.error("Please select a LLM model on the main settings page.") 45 | st.stop() 46 | if "llm_temperature" not in cookies: 47 | st.error("Please select a LLM model temperature on the main settings page.") 48 | st.stop() 49 | 50 | if mode_index == 0 or mode_index == 1: 51 | if "reranking_model" not in cookies: 52 | st.error("Please select an reranking model on the main settings page.") 53 | st.stop() 54 | if mode_index == 1 or mode_index == 2: 55 | if "ollama_url" not in cookies: 56 | st.error("Please select an Ollama server on the main settings page.") 57 | st.stop() 58 | if "ollama_model" not in cookies: 59 | st.error("Please select an Ollama model on the main settings page.") 60 | 
st.stop() 61 | if cookies["ollama_model"] == "": 62 | st.error("Please select an Ollama model on the main settings page.") 63 | st.stop() 64 | 65 | if "nemesis_url" not in cookies or "nemesis_username" not in cookies or "nemesis_password" not in cookies: 66 | st.error("Please enter Nemesis connection info on the main settings page.") 67 | st.stop() 68 | 69 | if "k_similarity" not in cookies: 70 | st.error("Please select an k_similarity on the main settings page.") 71 | st.stop() 72 | 73 | if "min_doc_results" not in cookies: 74 | st.error("Please select an min_doc_results on the main settings page.") 75 | st.stop() 76 | 77 | if "max_doc_results" not in cookies: 78 | st.error("Please select an max_doc_results on the main settings page.") 79 | st.stop() 80 | 81 | 82 | 83 | 84 | ######################################################## 85 | # 86 | # Model Delarations 87 | # 88 | ######################################################## 89 | n_gpu_layers = 0 90 | temp = "" 91 | min_doc_results = int(cookies['min_doc_results']) 92 | max_doc_results = int(cookies["max_doc_results"]) 93 | min_similarity_score = 0 94 | 95 | @st.cache_resource 96 | def get_llm(llm_model_path, n_gpu_layers): 97 | llm = Llama( 98 | model_path=llm_model_path, 99 | n_ctx=8192, 100 | n_gpu_layers=n_gpu_layers, 101 | verbose=False 102 | ) 103 | return llm 104 | 105 | @st.cache_resource 106 | def get_ollama_client(ollama_url): 107 | client = AsyncClient(ollama_url) 108 | return client 109 | 110 | # either local LLM or remote Ollama server with local reranker 111 | if mode_index == 0 or mode_index == 1: 112 | # check for GPU presence 113 | if torch.cuda.is_available(): 114 | # traditional Nvidia cuda GPUs 115 | device = torch.device("cuda:0") 116 | n_gpu_layers = int(cookies["n_gpu_layers"]) 117 | elif torch.backends.mps.is_available(): 118 | # for macOS M1/M2s 119 | device = torch.device("mps") 120 | n_gpu_layers = int(cookies["n_gpu_layers"]) 121 | else: 122 | device = torch.device("cpu") 123 | n_gpu_layers = 0 124 | 125 | @st.cache_resource 126 | def get_reranker(reranking_model, device): 127 | rerank_tokenizer = AutoTokenizer.from_pretrained(reranking_model) 128 | print(f"device: {device}") 129 | rerank_model = AutoModelForSequenceClassification.from_pretrained(reranking_model).to(device) 130 | return (rerank_tokenizer, rerank_model) 131 | temp = cookies["reranking_model"] 132 | with st.spinner(f"Downloading/loading reranking model {temp} ..."): 133 | (rerank_tokenizer, rerank_model) = get_reranker(cookies["reranking_model"], device) 134 | 135 | # only local LLM 136 | if mode_index == 0: 137 | llm_generation_kwargs = { 138 | "max_tokens": 512, 139 | "stream": True, 140 | "temperature": float(cookies["llm_temperature"]), 141 | "echo": False 142 | } 143 | try: 144 | if cookies["llm_model"] == "Intel/neural-chat-7b-v3-3": 145 | llm_model_path = hf_hub_download("TheBloke/neural-chat-7B-v3-3-GGUF", filename="neural-chat-7b-v3-3.Q5_K_M.gguf", local_files_only=True) 146 | elif cookies["llm_model"] == "openchat-3.5-0106": 147 | llm_model_path = hf_hub_download("TheBloke/openchat-3.5-0106-GGUF", filename="openchat-3.5-0106.Q5_K_M.gguf", local_files_only=True) 148 | elif cookies["llm_model"] == "Starling-LM-7B-alpha": 149 | llm_model_path = hf_hub_download("TheBloke/Starling-LM-7B-alpha-GGUF", filename="starling-lm-7b-alpha.Q5_K_M.gguf", local_files_only=True) 150 | else: 151 | llm_model = cookies["llm_model"] 152 | st.error(f"Invalid llm_model: {llm_model}") 153 | except: 154 | with st.spinner("Downloading LLM model (this will 
take some time)..."): 155 | if cookies["llm_model"] == "Intel/neural-chat-7b-v3-3": 156 | llm_model_path = hf_hub_download("TheBloke/neural-chat-7B-v3-3-GGUF", filename="neural-chat-7b-v3-3.Q5_K_M.gguf") 157 | elif cookies["llm_model"] == "openchat-3.5-0106": 158 | llm_model_path = hf_hub_download("TheBloke/openchat-3.5-0106-GGUF", filename="openchat-3.5-0106.Q5_K_M.gguf") 159 | elif cookies["llm_model"] == "Starling-LM-7B-alpha": 160 | llm_model_path = hf_hub_download("TheBloke/Starling-LM-7B-alpha-GGUF", filename="starling-lm-7b-alpha.Q5_K_M.gguf") 161 | else: 162 | llm_model = cookies["llm_model"] 163 | st.error(f"Invalid llm_model_path: {llm_model}") 164 | 165 | llm = get_llm(llm_model_path, n_gpu_layers) 166 | 167 | 168 | with st.sidebar: 169 | if mode_index == 0 or mode_index == 1: 170 | if torch.cuda.is_available(): 171 | st.success("Using CUDA GPU!") 172 | elif torch.backends.mps.is_available(): 173 | st.success("Using MPS GPU!") 174 | else: 175 | st.warning("No GPU detected, generation may be slow!") 176 | file_path_include = st.text_input("Enter file name/path pattern to include in the initial search:") 177 | file_path_exclude = st.text_input("Enter file name/path pattern to exclude from the initial search:") 178 | st.write("**Note**: _wildcard == \\*, use | to separate multiple terms, e.g. C:\\Temp\\\\*|\\*.pdf_") 179 | st.write("_You MUST surround any file name with \\*'s, i.e., \*A_Process_is_No_One.pdf\*_") 180 | 181 | st.title("RAGnarok Chat") 182 | st.warning('*WARNING: results not guaranteed to be correct! Verify answers with the supplied sources.*', icon="⚠️") 183 | 184 | if "private_messages" not in st.session_state: 185 | st.session_state.private_messages = [] 186 | 187 | for message in st.session_state.private_messages: 188 | with st.chat_message(message["role"]): 189 | st.markdown(message["content"]) 190 | 191 | if prompt := st.chat_input(""): 192 | st.session_state.private_messages.append({"role": "user", "content": prompt}) 193 | with st.chat_message("user"): 194 | st.markdown(prompt) 195 | 196 | with st.chat_message("assistant"): 197 | message_placeholder = st.empty() 198 | full_response = "" 199 | results = [] 200 | 201 | try: 202 | # data = { 203 | # "search_phrase": prompt, 204 | # "num_results": cookies['k_similarity'], 205 | # } 206 | 207 | # if file_path_include: 208 | # data["file_path_include"] = file_path_include 209 | # if file_path_exclude: 210 | # data["file_path_exclude"] = file_path_exclude 211 | 212 | # TODO: change this to query the local llm or remote Ollama server to get a list of search terms, then use those terms to query Nemesis 213 | 214 | client = get_ollama_client(cookies["ollama_url"]) 215 | single_turn_nlp_prompt = f"### System:\nYou are a helpful LLM who knows how to distil user queries down to an individual search term. The user is going to ask you a question, your job is to determine what the best search term is based on the users question. Only respond with the search term, nothing else. Do not preface the response with classifiers like 'search string' or 'search_term', only output the term itself. 
\n### User:\n{prompt}\n### Assistant:\n" 216 | async def nlp_term_chat(): 217 | nlp_search_term_full_response = "" 218 | message = {'role': 'user', 'content': single_turn_nlp_prompt} 219 | async for part in await client.chat(model=cookies['ollama_model'], messages=[message], stream=True): 220 | nlp_search_term_full_response += part['message']['content'] 221 | return nlp_search_term_full_response 222 | nlp_search_term = asyncio.run(nlp_term_chat()) 223 | 224 | 225 | url = "{}/hasura/v1/graphql".format(cookies["nemesis_url"]) 226 | payload = json.dumps({ 227 | "query": "\n query SearchDocuments($searchQuery: String!, $pathPattern: String!, $agentPattern: String!, $project: String, $startDate: timestamptz, $endDate: timestamptz) {\n search_documents(\n args: {\n search_query: $searchQuery,\n path_pattern: $pathPattern,\n agent_pattern: $agentPattern,\n project_name: $project,\n start_date: $startDate,\n end_date: $endDate,\n max_results: 100\n }\n ) {\n object_id\n chunk_number\n content\n file_name\n path\n extension\n project\n agent_id\n timestamp\n }\n }\n ", 228 | "variables": { 229 | "searchQuery": nlp_search_term, 230 | "pathPattern": "%", 231 | "agentPattern": "%", 232 | "project": None, 233 | "startDate": None, 234 | "endDate": None 235 | } 236 | }) 237 | # TODO: add the X-Hasura-Admin-Secret to the settings page and use that instead of hardcoding it 238 | headers = { 239 | 'Content-Type': 'application/json', 240 | 'X-Hasura-Admin-Secret': 'pass456', 241 | } 242 | 243 | response = requests.request("POST", url, headers=headers, data=payload, verify=False) 244 | if response.status_code != 200: 245 | st.error(f"Error calling Nemesis GraphQL API: {response.status_code} - {response.text}") 246 | st.stop() 247 | else: 248 | results = response.json() 249 | if len(results["data"]["search_documents"]) == 0: 250 | st.error(f"No documents found matching the search criteria: {nlp_search_term}") 251 | st.stop() 252 | else: 253 | results = results["data"]["search_documents"] 254 | print(f"Found {len(results)} documents matching the search criteria.") 255 | 256 | 257 | 258 | # url = f'{cookies["nemesis_url"]}nlp/hybrid_search' 259 | # with st.spinner("Searching for initial documents from Nemesis..."): 260 | # if url.startswith("https://"): 261 | # response = requests.get( 262 | # url, 263 | # json=data, 264 | # auth=HTTPBasicAuth(cookies["nemesis_username"], cookies["nemesis_password"]), 265 | # verify=False 266 | # ) 267 | # else: 268 | # response = requests.get( 269 | # url, 270 | # json=data, 271 | # auth=HTTPBasicAuth(cookies["nemesis_username"], cookies["nemesis_password"]) 272 | # ) 273 | # if response.status_code == 200: 274 | # results = response.json() 275 | # if "error" in results: 276 | # if results["error"] == "index_not_found_exception": 277 | # st.error(f"No documents have been indexed!") 278 | # else: 279 | # error = results["error"] 280 | # st.error(f"Error calling text search {url} with search_phrase '{prompt}' : {error}") 281 | # st.stop() 282 | # else: 283 | # st.error(f"Error calling text search {url} with search_phrase '{prompt}' : {response.status_code}") 284 | # st.stop() 285 | 286 | except Exception as e: 287 | st.error(f"Error calling text search {url} with search_phrase '{prompt}' : {e}") 288 | st.stop() 289 | 290 | documents = results 291 | if mode_index == 0 or mode_index == 1: 292 | # Reranking process 293 | pairs = [] 294 | for document in documents: 295 | pairs += [[prompt, document["content"].replace('"""', '"').replace("'''", "'")]] 296 | 297 | start =
time.time() 298 | inputs = rerank_tokenizer( 299 | pairs, 300 | padding=True, 301 | truncation=True, 302 | return_tensors="pt", 303 | max_length=512, 304 | ).to(device) 305 | print(inputs) 306 | with st.spinner("Reranking documents..."): 307 | rerank_scores = (rerank_model(**inputs, return_dict=True).logits.view(-1,).float()).tolist() 308 | end = time.time() 309 | print(f"Reranker evaluated in {(end - start):.2f} seconds") 310 | 311 | final_results = [] 312 | 313 | for i in range(len(documents)): 314 | document = documents[i] 315 | rerank_score = rerank_scores[i] if (mode_index == 0 or mode_index == 1) else 0.1 316 | originating_file_name = ntpath.basename(document["path"]) 317 | originating_object_id = document["object_id"] 318 | nemesis_url = cookies["nemesis_url"] 319 | # link to the file in Nemesis 320 | file_page_link = f"{nemesis_url}files/{originating_object_id}" 321 | # snippet_id = document["id"] 322 | # direct link to the snippet 323 | # snippet_link = f"{nemesis_url}/kibana/app/discover#/?_a=(filters:!((query:(match_phrase:(_id:'{snippet_id}')))),index:'45fbaacb-9ef9-4cd1-b837-bc8ab0448220')&_g=(time:(from:now-1y%2Fd,to:now))" 324 | 325 | final_results += [ 326 | [ 327 | rerank_score, 328 | originating_file_name, 329 | file_page_link, 330 | # snippet_link, 331 | document["content"].replace('"""', '"').replace("'''", "'") 332 | ] 333 | ] 334 | 335 | # ensure results are sorted with the highest similarity score first 336 | final_results.sort(key=lambda x: x[0], reverse=True) 337 | 338 | # start at the minimum number of documents required 339 | i_range = min_doc_results if min_doc_results < len(final_results) else len(final_results) 340 | sources = final_results[0:i_range] 341 | 342 | responses_generated = False 343 | 344 | # add in results over the minimum similarity score 345 | for result in final_results[i_range:]: 346 | if result[0] > min_similarity_score: 347 | sources.append(result) 348 | 349 | # finally, cap the final results so we don't go over the LLM context 350 | # default == 15 (15 * 510 tokens == 7650, leaving room for prompt overhead) 351 | sources = sources[:max_doc_results] 352 | 353 | print(f"\nNumber of source snippets: {len(sources)}\n") 354 | 355 | sources_formatted = [] 356 | for i in range(len(sources)): 357 | source = sources[i] 358 | sources_formatted += [f"{i+1}. [{source[1]}]({source[2]}) (score: {source[0]:.2f})"] 359 | sources_formatted_final = "\n".join(sources_formatted) 360 | 361 | ####################################################### 362 | # 363 | # LLM prompting 364 | # 365 | ####################################################### 366 | 367 | template = """ 368 | You are a helpful LLM who knows how to reason over source text blocks. Generate a coherent and informative response based on the following source blocks. 369 | 370 | Each Source Block starts with the Source Block number, followed by a Similarity Score reflecting the block's relevance to the overall prompt, followed by the originating Filename, finally followed by the source Text itself. 371 | 372 | The similarity scores represent the model's confidence in the relevance of each source block. Higher scores indicate higher perceived similarity. Utilize the information in all source blocks to enhance your answer, but if any source blocks contain contradictory information use the information from the source block with the higher Similarity Score. 373 | 374 | Only answer questions using the sources below and if you're not sure of an answer, you can say "I don't know based on the retrieved sources".
375 | """ 376 | 377 | for i in range(len(sources)): 378 | # final_result_score, originating_file_name, file_page_link, snippet_link, final_result_text = sources[i] 379 | final_result_score, originating_file_name, file_page_link, final_result_text = sources[i] 380 | 381 | template += f""" 382 | Source Block: {i+1} 383 | Similarity Score: {final_result_score} 384 | Filename: {originating_file_name} 385 | Text: 386 | \"\"\" 387 | {final_result_text} 388 | \"\"\" 389 | 390 | --- 391 | """ 392 | 393 | template += f""" 394 | Question: {prompt} 395 | """ 396 | 397 | if "neural-chat" in cookies["llm_model"]: 398 | single_turn_prompt = f"### System:\nYou are a helpful LLM who knows how to reason over source text blocks.\n### User:\n{template}\n### Assistant:\n" 399 | else: 400 | single_turn_prompt = f"GPT4 Correct User: {template}<|end_of_turn|>GPT4 Correct Assistant:" 401 | 402 | with st.spinner("LLM is processing the prompt..."): 403 | start = time.time() 404 | if mode_index == 0: 405 | stream = llm.create_completion(single_turn_prompt, **llm_generation_kwargs) 406 | for output in stream: 407 | full_response += (output['choices'][0]['text'] or "").split("### Assistant:\n")[-1] 408 | message_placeholder.markdown(full_response + "▌") 409 | else: 410 | client = get_ollama_client(cookies["ollama_url"]) 411 | async def chat(): 412 | full_response = "" 413 | message = {'role': 'user', 'content': single_turn_prompt} 414 | async for part in await AsyncClient().chat(model=cookies['ollama_model'], messages=[message], stream=True): 415 | print(part['message']['content'], end='', flush=True) 416 | full_response += part['message']['content'] 417 | return full_response 418 | full_response = asyncio.run(chat()) 419 | 420 | 421 | end = time.time() 422 | 423 | message_placeholder.markdown(f"{full_response}\n\n*Sources:*\n\nSearch Term: {nlp_search_term}\n\n{sources_formatted_final}\n\n_Generation time: {(end - start):.2f} seconds_\n") 424 | 425 | print(f"LLM generation completed in {(end - start):.2f} seconds") 426 | 427 | st.session_state.private_messages.append({"role": "assistant", "content": f"{full_response}\n\n*Sources:*\n{sources_formatted_final}\n\n_Generation time: {(end - start):.2f} seconds_\n"}) 428 | --------------------------------------------------------------------------------