├── .gitignore
├── imgs
│   ├── qdurllm.png
│   ├── tutorial1.png
│   ├── tutorial2.png
│   └── tutorial3.png
├── app
│   ├── __pycache__
│   │   ├── rag.cpython-312.pyc
│   │   ├── loadUrls.cpython-312.pyc
│   │   └── texInference.cpython-312.pyc
│   ├── loadUrls.py
│   ├── texInference.py
│   ├── app.py
│   └── rag.py
├── compose.yaml
├── docs
│   ├── about.md
│   ├── contributing.md
│   └── index.md
├── mkdocs.yaml
├── LICENSE
├── CONTRIBUTING.md
├── README.md
└── environment.yml

/.gitignore:
--------------------------------------------------------------------------------
1 | */.cache
2 | application/flagged
3 | qdrant_storage/
--------------------------------------------------------------------------------
/imgs/qdurllm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AstraBert/qdurllm/HEAD/imgs/qdurllm.png
--------------------------------------------------------------------------------
/imgs/tutorial1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AstraBert/qdurllm/HEAD/imgs/tutorial1.png
--------------------------------------------------------------------------------
/imgs/tutorial2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AstraBert/qdurllm/HEAD/imgs/tutorial2.png
--------------------------------------------------------------------------------
/imgs/tutorial3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AstraBert/qdurllm/HEAD/imgs/tutorial3.png
--------------------------------------------------------------------------------
/app/__pycache__/rag.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AstraBert/qdurllm/HEAD/app/__pycache__/rag.cpython-312.pyc
--------------------------------------------------------------------------------
/app/__pycache__/loadUrls.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AstraBert/qdurllm/HEAD/app/__pycache__/loadUrls.cpython-312.pyc
--------------------------------------------------------------------------------
/app/__pycache__/texInference.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AstraBert/qdurllm/HEAD/app/__pycache__/texInference.cpython-312.pyc
--------------------------------------------------------------------------------
/compose.yaml:
--------------------------------------------------------------------------------
1 | networks:
2 |   mynet:
3 |     driver: bridge
4 | 
5 | services:
6 |   qdrant:
7 |     image: qdrant/qdrant
8 |     ports:
9 |       - "6333:6333"
10 |     volumes:
11 |       - "./qdrant_storage:/qdrant/storage"
12 |     networks:
13 |       - mynet
14 | 
--------------------------------------------------------------------------------
/docs/about.md:
--------------------------------------------------------------------------------
1 | # About `qdurllm`
2 | 
3 | **`qdurllm`** is an open-source, local Gradio application that leverages:
4 | 
5 | - semantic caching
6 | - sparse retrieval + reranking
7 | - LLM inference
8 | 
9 | to allow you to upload website pages to a Qdrant database, search them, and chat with them.
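
Under the hood, each chat message first checks a semantic cache of previous question/answer pairs, and only falls back to sparse retrieval, dense reranking and LLM generation on a cache miss. Here is a minimal sketch of that flow, using the project's own `rag.py` helpers — note that importing `rag` (re)creates the Qdrant collections, so this assumes the Qdrant container from `compose.yaml` is up on `localhost:6333` and that URLs have then been uploaded through the app's *Upload URLs* tab:

```python
# Minimal sketch of the cache-then-retrieve flow (see app.py for the full version)
from rag import client, dense_encoder, sparse_encoder, NeuralSearcher, SemanticCache

searcher = NeuralSearcher("memory", client, dense_encoder, sparse_encoder)
cache = SemanticCache(client, dense_encoder, "semantic_cache")

question = "What does this website say about vector search?"
answer = cache.search_cache(question)  # returns "" on a cache miss
if not answer:
    # sparse (SPLADE) retrieval over the uploaded chunks, then dense reranking
    context, url = searcher.search_text(question)
    # context is then passed to the LLM (texInference.pipe) and the resulting
    # question/answer pair is written back to the semantic cache
```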
10 | 
11 | > _Powered by [Qdrant](https://qdrant.tech), [FastEmbed](https://pypi.org/project/fastembed/), [SentenceTransformers](https://sbert.net/), [Nomic AI](https://www.nomic.ai/), [HuggingFace](https://huggingface.co), [Gradio](https://gradio.app), [Unstructured.io](https://unstructured.io/) and [LangChain](https://langchain.com)_
12 | 
13 | 
--------------------------------------------------------------------------------
/mkdocs.yaml:
--------------------------------------------------------------------------------
1 | site_name: qdurllm
2 | theme:
3 |   name: material
4 |   features:
5 |     - navigation.instant
6 |     - navigation.tracking
7 |     - navigation.tabs
8 |     - navigation.sections
9 |     - navigation.expand
10 |     - search.highlight
11 |     - search.share
12 |   palette:
13 |     scheme: default
14 |     primary: indigo
15 |     accent: indigo
16 | plugins:
17 |   - search
18 |   - awesome-pages
19 | markdown_extensions:
20 |   - pymdownx.highlight
21 |   - pymdownx.superfences
22 |   - pymdownx.inlinehilite
23 |   - pymdownx.snippets
24 |   - pymdownx.tasklist
25 |   - admonition
26 |   - toc
27 |   - attr_list
28 | nav:
29 |   - Home: index.md
30 |   - About: about.md
31 |   - Contributing: contributing.md
--------------------------------------------------------------------------------
/app/loadUrls.py:
--------------------------------------------------------------------------------
1 | from langchain_community.document_loaders.url import UnstructuredURLLoader
2 | from langchain.text_splitter import CharacterTextSplitter
3 | from rag import upload_text_to_qdrant, client
4 | from typing import List, Dict, Union
5 | 
6 | def urlload(urls: str) -> Union[List[Dict[str, str]], str]:
7 |     links = urls.split(",")
8 |     try:
9 |         loader = UnstructuredURLLoader(
10 |             urls=links, mode="elements",
11 |             strategy="fast"
12 |         )
13 |         docs = loader.load()
14 |         text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
15 |         pages = text_splitter.split_documents(docs)
16 |         contents = [{"text": page.page_content, "url": page.metadata["source"]} for page in pages]
17 |         return contents
18 |     except Exception as e:
19 |         # the caller (app.py) tells success from failure by the return type
20 |         return f"An error occurred while parsing the URLs: {e}"
21 | 
22 | 
23 | def to_db(contents: List[Dict[str, str]]) -> None:
24 |     # NOTE: point IDs restart from 0 on every call, so uploading a second batch
25 |     # of URLs overwrites the points stored by the previous one
26 |     c = 0
27 |     for content in contents:
28 |         upload_text_to_qdrant(client, "memory", content, c)
29 |         c += 1
30 |     return
31 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2025 Clelia (Astra) Bertelli
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/app/texInference.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | warnings.filterwarnings('ignore')
3 | import torch
4 | from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
5 | from trl import setup_chat_format
6 | from typing import Dict, List
7 | 
8 | model_name = 'Qwen/Qwen2.5-1.5B-Instruct'
9 | quantization_config = BitsAndBytesConfig(load_in_4bit = True, bnb_4bit_compute_dtype = torch.bfloat16, bnb_4bit_use_double_quant = True, bnb_4bit_quant_type = 'nf4')
10 | device = 'cuda' if torch.cuda.is_available() else 'cpu'
11 | # 4-bit bitsandbytes quantization requires a CUDA device; from_pretrained places the
12 | # model on the GPU itself (calling .to() on a 4-bit quantized model raises an error)
13 | quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype = torch.bfloat16, quantization_config = quantization_config)
14 | tokenizer = AutoTokenizer.from_pretrained(model_name)
15 | # drop the built-in chat template so that setup_chat_format can install the ChatML one
16 | tokenizer.chat_template = None
17 | (quantized_model, tokenizer) = setup_chat_format(model = quantized_model, tokenizer = tokenizer)
18 | 
19 | def pipe(prompt: List[Dict[str, str]], temperature: float, top_p: float, max_new_tokens: int, repetition_penalty: float) -> str:
20 |     tokenized_chat = tokenizer.apply_chat_template(prompt, tokenize = True, add_generation_prompt = True, return_tensors = 'pt').to(device)
21 |     # do_sample=True is needed for temperature/top_p to take effect; generate()
22 |     # already returns the output tensor on the model's device
23 |     outputs = quantized_model.generate(tokenized_chat, max_new_tokens = max_new_tokens, do_sample = True, temperature = temperature, top_p = top_p, repetition_penalty = repetition_penalty)
24 |     results = tokenizer.decode(outputs[0])
25 |     return results
26 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to `qdurllm`
2 | 
3 | Do you want to contribute to this project? Make sure to read these guidelines first :)
4 | 
5 | ## Issue
6 | 
7 | **When to do it**:
8 | 
9 | - You found bugs but you don't know how to solve them, or you don't have the time/will to fix them
10 | - You want new features but you don't know how to implement them, or you don't have the time/will to implement them
11 | 
12 | > ⚠️ _Always check open and closed issues before you submit yours to avoid duplicates_
13 | 
14 | **How to do it**:
15 | 
16 | - Open an issue
17 | - Give the issue a meaningful title (short but effective problem description)
18 | - Describe the problem following the issue template
19 | 
20 | ## Traditional contribution
21 | 
22 | **When to do it**:
23 | 
24 | - You found bugs and corrected them
25 | - You optimized/improved the code
26 | - You added new features that you think could be useful to others
27 | 
28 | **How to do it**:
29 | 
30 | 1. Fork this repository
31 | 2. Commit your changes
32 | 3. Submit a pull request (make sure to provide a thorough description of the changes)
33 | 
34 | 
35 | ## Showcase your qdurllm
36 | 
37 | **When to do it**:
38 | 
39 | - You modified the base application with new features, but you don't want to (or can't) merge them with the original qdurllm
40 | 
41 | **How to do it**:
42 | 
43 | - Go to the [_GitHub Discussions > Show and tell_](https://github.com/AstraBert/qdurllm/discussions) page
44 | - Open a new discussion there, describing your qdurllm application
45 | 
46 | ### Thanks for contributing!
--------------------------------------------------------------------------------
/docs/contributing.md:
--------------------------------------------------------------------------------
1 | # Contributing to `qdurllm`
2 | 
3 | Do you want to contribute to this project? Make sure to read these guidelines first :)
4 | 
5 | ## Issue
6 | 
7 | **When to do it**:
8 | 
9 | - You found bugs but you don't know how to solve them, or you don't have the time/will to fix them
10 | - You want new features but you don't know how to implement them, or you don't have the time/will to implement them
11 | 
12 | > ⚠️ _Always check open and closed issues before you submit yours to avoid duplicates_
13 | 
14 | **How to do it**:
15 | 
16 | - Open an issue
17 | - Give the issue a meaningful title (short but effective problem description)
18 | - Describe the problem following the issue template
19 | 
20 | ## Traditional contribution
21 | 
22 | **When to do it**:
23 | 
24 | - You found bugs and corrected them
25 | - You optimized/improved the code
26 | - You added new features that you think could be useful to others
27 | 
28 | **How to do it**:
29 | 
30 | 1. Fork this repository
31 | 2. Commit your changes
32 | 3. Submit a pull request (make sure to provide a thorough description of the changes)
33 | 
34 | 
35 | ## Showcase your qdurllm
36 | 
37 | **When to do it**:
38 | 
39 | - You modified the base application with new features, but you don't want to (or can't) merge them with the original qdurllm
40 | 
41 | **How to do it**:
42 | 
43 | - Go to the [_GitHub Discussions > Show and tell_](https://github.com/AstraBert/qdurllm/discussions) page
44 | - Open a new discussion there, describing your qdurllm application
45 | 
46 | ### Thanks for contributing!
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # qdurllm
2 | _Search your favorite websites and chat with them, on your desktop🌐_
3 | 
4 | # Docs in active development!👷‍♀️
5 | 
6 | They will soon be available at: https://astrabert.github.io/qdurllm/
7 | 
8 | In the meantime, refer to the **Quickstart guide** in this README!
9 | 
10 | ## Quickstart
11 | 
12 | ### 1. Prerequisites
13 | 
14 | - [`conda`](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) package manager
15 | - [`docker`](https://www.docker.com/) and [`docker compose`](https://docs.docker.com/compose/)
16 | 
17 | ### 2. Installation
18 | 
19 | > [!IMPORTANT]
20 | > _This is only for the pre-release of `v1.0.0`, i.e. `v1.0.0-rc.0`_
21 | 
22 | 1. Clone the `january-2025` branch of this GitHub repo:
23 | 
24 | ```bash
25 | git clone -b january-2025 --single-branch https://github.com/AstraBert/qdurllm.git
26 | cd qdurllm/
27 | ```
28 | 
29 | 2. Create the `conda` environment:
30 | 
31 | ```bash
32 | conda env create -f environment.yml
33 | ```
34 | 
35 | 3. Pull `qdrant` from Docker Hub:
36 | 
37 | ```bash
38 | docker pull qdrant/qdrant
39 | ```
40 | 
41 | ### 3. Launching
42 | 
43 | 1. Launch the `qdrant` vector database service with `docker compose` (from within the `qdurllm` folder):
44 | 
45 | ```bash
46 | docker compose up
47 | ```
48 | 
49 | 2. Activate the `qdurllm` conda environment you just created:
50 | 
51 | ```bash
52 | conda activate qdurllm
53 | ```
54 | 
55 | 3. Go inside the `app` directory and launch the Gradio application:
56 | 
57 | ```bash
58 | cd app/
59 | python3 app.py
60 | ```
61 | 
62 | You should see the app running on `http://localhost:7860` once all the models are downloaded from HuggingFace Hub.
63 | 
64 | ## Relies on
65 | 
66 | - [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct), with Apache 2.0 license
67 | - [nomic-ai/modernbert-embed-base](https://huggingface.co/nomic-ai/modernbert-embed-base), with Apache 2.0 license
68 | - [prithivida/Splade_PP_en_v1](https://huggingface.co/prithivida/Splade_PP_en_v1), with Apache 2.0 license
69 | 
70 | 
71 | ## Give feedback!
72 | 
73 | Comment on the [**discussion thread created for this release**](https://github.com/AstraBert/qdurllm/discussions) with your feedback or create [**issues**](https://github.com/AstraBert/qdurllm/issues) :)
74 | 
75 | 
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # qdurllm
2 | _Search your favorite websites and chat with them, on your desktop🌐_
3 | 
4 | # Docs in active development!👷‍♀️
5 | 
6 | They will soon be available at: https://astrabert.github.io/qdurllm/
7 | 
8 | In the meantime, refer to the **Quickstart guide** on this page!
9 | 
10 | ## Quickstart
11 | 
12 | ### 1. Prerequisites
13 | 
14 | - [`conda`](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) package manager
15 | - [`docker`](https://www.docker.com/) and [`docker compose`](https://docs.docker.com/compose/)
16 | 
17 | ### 2. Installation
18 | 
19 | > [!IMPORTANT]
20 | > _This is only for the pre-release of `v1.0.0`, i.e. `v1.0.0-rc.0`_
21 | 
22 | 1. Clone the `january-2025` branch of this GitHub repo:
23 | 
24 | ```bash
25 | git clone -b january-2025 --single-branch https://github.com/AstraBert/qdurllm.git
26 | cd qdurllm/
27 | ```
28 | 
29 | 2. Create the `conda` environment:
30 | 
31 | ```bash
32 | conda env create -f environment.yml
33 | ```
34 | 
35 | 3. Pull `qdrant` from Docker Hub:
36 | 
37 | ```bash
38 | docker pull qdrant/qdrant
39 | ```
40 | 
41 | ### 3. Launching
42 | 
43 | 1. Launch the `qdrant` vector database service with `docker compose` (from within the `qdurllm` folder):
44 | 
45 | ```bash
46 | docker compose up
47 | ```
48 | 
49 | 2. Activate the `qdurllm` conda environment you just created:
50 | 
51 | ```bash
52 | conda activate qdurllm
53 | ```
54 | 
55 | 3. Go inside the `app` directory and launch the Gradio application:
56 | 
57 | ```bash
58 | cd app/
59 | python3 app.py
60 | ```
61 | 
62 | You should see the app running on `http://localhost:7860` once all the models are downloaded from HuggingFace Hub.
63 | 
64 | ## Relies on
65 | 
66 | - [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct), with Apache 2.0 license
67 | - [nomic-ai/modernbert-embed-base](https://huggingface.co/nomic-ai/modernbert-embed-base), with Apache 2.0 license
68 | - [prithivida/Splade_PP_en_v1](https://huggingface.co/prithivida/Splade_PP_en_v1), with Apache 2.0 license
69 | 
70 | 
71 | ## Give feedback!
72 | 
73 | Comment on the [**discussion thread created for this release**](https://github.com/AstraBert/qdurllm/discussions) with your feedback or create [**issues**](https://github.com/AstraBert/qdurllm/issues) :)
74 | 
75 | 
--------------------------------------------------------------------------------
/app/app.py:
--------------------------------------------------------------------------------
1 | from rag import client, SemanticCache, NeuralSearcher, dense_encoder, sparse_encoder
2 | from texInference import pipe
3 | from loadUrls import urlload, to_db
4 | import gradio as gr
5 | import time
6 | 
7 | 
8 | searcher = NeuralSearcher("memory", client, dense_encoder, sparse_encoder)
9 | semantic_cache = SemanticCache(client, dense_encoder, "semantic_cache")
10 | 
11 | 
12 | def upload2qdrant(url):
13 |     global client
14 |     documents = urlload(url)
15 |     if isinstance(documents, list):  # urlload returns an error string on failure
16 |         try:
17 |             to_db(documents)
18 |             return "URLs successfully uploaded to Qdrant collection!"
19 |         except Exception as e:
20 |             return f"An error occurred: {e}"
21 |     else:
22 |         return documents
23 | 
24 | demo0 = gr.Interface(fn=upload2qdrant, title="Upload URL content to Qdrant", inputs=gr.Textbox(label="URL(s)", info="Add one URL or more (if more, you should provide them comma-separated, like this: URL1,URL2,...,URLn)"), outputs=gr.Textbox(label="Logs"))
25 | 
26 | 
27 | def reply(message, history, ntokens, rep_pen, temp, topp, systemins):
28 |     sr = semantic_cache.search_cache(message)
29 |     if sr:  # semantic-cache hit: stream the cached answer back character by character
30 |         response = sr
31 |         this_hist = ''
32 |         for c in response:
33 |             this_hist += c
34 |             time.sleep(0.001)
35 |             yield this_hist
36 |     else:  # cache miss: retrieve context, generate with the LLM and cache the result
37 |         context, url = searcher.search_text(message)
38 |         prompt = [{"role": "system", "content": systemins}, {"role": "user", "content": f"This is the context information to reply to my prompt:\n\n{context}"}, {"role": "user", "content": message}]
39 |         results = pipe(prompt, temp, topp, ntokens, rep_pen)
40 |         results = results.split("<|im_start|>assistant\n")[1]  # keep only the generated (ChatML) assistant turn
41 |         response = results.replace("<|im_end|>", "")
42 |         semantic_cache.upload_to_cache(message, response)
43 |         this_hist = ''
44 |         for c in response:
45 |             this_hist += c
46 |             time.sleep(0.001)
47 |             yield this_hist
48 | 
49 | def direct_search(input_text):
50 |     context, url = searcher.search_text(input_text)
51 |     return context, f"Reference website [here]({url})"
52 | 
53 | demo2 = gr.Interface(fn=direct_search, inputs=gr.Textbox(label="Search Query", placeholder="Input your search query here..."), outputs=[gr.Textbox(label="Retrieved Content"), gr.Markdown(label="URL")], title="Search your URLs")
54 | 
55 | user_max_new_tokens = gr.Slider(0, 4096, value=512, label="Max new tokens", info="Select max output tokens (higher number of tokens will result in a longer latency)")
56 | user_max_temperature = gr.Slider(0, 1, value=0.1, step=0.1, label="Temperature", info="Select generation temperature")
57 | user_max_rep_pen = gr.Slider(0, 10, value=1.2, step=0.1, label="Repetition penalty", info="Select repetition penalty")
58 | user_top_p = gr.Slider(0.1, 1, value=1, step=0.1, label="top_p", info="Select top_p for the generation")
59 | system_ins = gr.Textbox(label="System Prompt", info="Insert your system prompt here", value="You are a helpful web searching assistant. You reply based on the contextual information you are provided with and on your knowledge.")
60 | additional_accordion = gr.Accordion(label="Parameters to be set before you start chatting", open=True)
61 | demo1 = gr.ChatInterface(fn=reply, title="Chat with your URLs", additional_inputs=[user_max_new_tokens, user_max_temperature, user_max_rep_pen, user_top_p, system_ins], additional_inputs_accordion=additional_accordion)
62 | 
63 | my_theme = gr.themes.Soft(primary_hue=gr.themes.colors.rose, secondary_hue=gr.themes.colors.pink)
64 | 
65 | demo = gr.TabbedInterface([demo0, demo1, demo2], ["Upload URLs", "Chat with URLs", "Direct Search"], theme=my_theme)
66 | 
67 | if __name__ == "__main__":
68 |     demo.launch(server_name="0.0.0.0", server_port=7860)
--------------------------------------------------------------------------------
/app/rag.py:
--------------------------------------------------------------------------------
1 | from qdrant_client import QdrantClient, models
2 | from fastembed import SparseTextEmbedding
3 | from sentence_transformers import SentenceTransformer
4 | import torch
5 | import uuid
6 | from typing import List, Dict
7 | 
8 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
9 | 
10 | dense_encoder = SentenceTransformer('nomic-ai/modernbert-embed-base').to(device)
11 | sparse_encoder = SparseTextEmbedding(model_name = 'prithivida/Splade_PP_en_v1')
12 | 
13 | client = QdrantClient('http://localhost:6333')
14 | client.recreate_collection(  # NOTE: collections are dropped and re-created at import time, so data does not persist across restarts
15 |     collection_name = 'memory',
16 |     vectors_config = {},
17 |     sparse_vectors_config = {
18 |         'sparse-text': models.SparseVectorParams(index = models.SparseIndexParams(on_disk = False))
19 |     })
20 | client.recreate_collection(
21 |     collection_name = 'semantic_cache',
22 |     vectors_config = models.VectorParams(size = 768, distance = models.Distance.COSINE)  # 768 = modernbert-embed-base embedding size
23 | )
24 | 
25 | def reranking(docs: List[str], query: str, dense_encoder: SentenceTransformer):
26 |     query = "search_query: " + query  # prefixes expected by modernbert-embed
27 |     docs = ["search_document: " + doc for doc in docs]
28 |     query_vector = dense_encoder.encode(query)
29 |     docs_vector = dense_encoder.encode(docs)
30 |     similarities = dense_encoder.similarity(docs_vector, query_vector)
31 |     sims = [float(sim[0]) for sim in similarities]
32 |     text2sims = {docs[i]: sims[i] for i in range(len(sims))}
33 |     sorted_items = sorted(text2sims.items(), key=lambda x: x[1], reverse=True)
34 |     return sorted_items[0][0]  # return only the best-scoring document
35 | 
36 | 
37 | def get_sparse_embedding(text: str, model: SparseTextEmbedding):
38 |     embeddings = list(model.embed(text))
39 |     vector = {"sparse-text": models.SparseVector(indices=embeddings[0].indices, values=embeddings[0].values)}
40 |     return vector
41 | 
42 | 
43 | def get_query_sparse_embedding(text: str, model: SparseTextEmbedding):
44 |     embeddings = list(model.embed(text))
45 |     query_vector = models.NamedSparseVector(
46 |         name="sparse-text",
47 |         vector=models.SparseVector(
48 |             indices=embeddings[0].indices,
49 |             values=embeddings[0].values,
50 |         ),
51 |     )
52 |     return query_vector
53 | 
54 | 
55 | def upload_text_to_qdrant(client: QdrantClient, collection_name: str, docs: dict, point_id_sparse: int):
56 |     try:
57 |         client.upsert(
58 |             collection_name=collection_name,
59 |             points=[
60 |                 models.PointStruct(
61 |                     id=point_id_sparse,
62 |                     vector=get_sparse_embedding(docs["text"], sparse_encoder),
63 |                     payload=docs,
64 |                 )
65 |             ],
66 |         )
67 |         return True
68 |     except Exception:
69 |         return False
70 | 
71 | 
72 | class SemanticCache:
73 |     def __init__(self, client: QdrantClient, text_encoder: SentenceTransformer, collection_name: str, threshold: float = 0.75):  # threshold = minimum cosine similarity for a cache hit
74 |         self.client = client
75 |         self.text_encoder = text_encoder
76 |         self.collection_name = collection_name
77 |         self.threshold = threshold
78 |     def upload_to_cache(self, question: str, answer: str):
79 |         docs = {"question": question, "answer": answer}
80 |         tct = 'search_document: ' + question
81 |         point_id = str(uuid.uuid4())
82 |         self.client.upsert(
83 |             collection_name=self.collection_name,
84 |             points=[
85 |                 models.PointStruct(
86 |                     id=point_id,
87 |                     vector=self.text_encoder.encode(tct).tolist(),
88 |                     payload=docs,
89 |                 )
90 |             ],
91 |         )
92 |     def search_cache(self, question: str, limit: int = 5):
93 |         question = 'search_query: ' + question
94 |         vector = self.text_encoder.encode(question).tolist()
95 |         search_result = self.client.search(
96 |             collection_name=self.collection_name,
97 |             query_vector=vector,
98 |             query_filter=None,
99 |             limit=limit,
100 |         )
101 |         payloads = [hit.payload["answer"] for hit in search_result if hit.score > self.threshold]
102 |         if len(payloads) > 0:
103 |             return payloads[0]
104 |         else:
105 |             return ""
106 | 
107 | 
108 | 
109 | class NeuralSearcher:
110 |     def __init__(self, text_collection_name: str, client: QdrantClient, dense_encoder: SentenceTransformer, sparse_encoder: SparseTextEmbedding):
111 |         self.text_collection_name = text_collection_name
112 |         self.dense_encoder = dense_encoder
113 |         self.qdrant_client = client
114 |         self.sparse_encoder = sparse_encoder
115 |     def search_text(self, text: str, limit: int = 5):
116 |         search_result_sparse = self.qdrant_client.search(
117 |             collection_name=self.text_collection_name,
118 |             query_vector=get_query_sparse_embedding(text, self.sparse_encoder),
119 |             query_filter=None,
120 |             limit=limit,
121 |         )
122 |         payloads = [hit.payload["text"] for hit in search_result_sparse]
123 |         urls = [hit.payload["url"] for hit in search_result_sparse]
124 |         txt2url = {payloads[i]: urls[i] for i in range(len(urls))}
125 |         context = reranking(payloads, text, self.dense_encoder)
126 |         context = context.replace("search_document: ", "")
127 |         return context, txt2url[context]
128 | 
129 | 
130 | 
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: qdurllm 2 | channels: 3 | - pytorch 4 | - nvidia 5 | - conda-forge 6 | dependencies: 7 | - _libgcc_mutex=0.1=conda_forge 8 | - _openmp_mutex=4.5=2_kmp_llvm 9 | - aiohappyeyeballs=2.4.4=pyhd8ed1ab_1 10 | - aiohttp=3.11.11=py312h178313f_0 11 | - aiosignal=1.3.2=pyhd8ed1ab_0 12 | - annotated-types=0.7.0=pyhd8ed1ab_1 13 | - anyio=4.8.0=pyhd8ed1ab_0 14 | - async-timeout=4.0.3=pyhd8ed1ab_0 15 | - attrs=24.3.0=pyh71513ae_0 16 | - aws-c-auth=0.7.22=h96bc93b_2 17 | - aws-c-cal=0.6.14=h88a6e22_1 18 | - aws-c-common=0.9.19=h4ab18f5_0 19 | - aws-c-compression=0.2.18=h83b837d_6 20 | - aws-c-event-stream=0.4.2=ha47c788_12 21 | - aws-c-http=0.8.1=h29d6fba_17 22 | - aws-c-io=0.14.8=h21d4f22_5 23 | - aws-c-mqtt=0.10.4=h759edc4_4 24 | - aws-c-s3=0.5.9=h594631b_3 25 | - aws-c-sdkutils=0.1.16=h83b837d_2 26 | - aws-checksums=0.1.18=h83b837d_6 27 | - aws-crt-cpp=0.26.9=he3a8b3b_0 28 | - aws-sdk-cpp=1.11.329=hba8bd5f_3 29 | - backoff=2.2.1=pyhd8ed1ab_1 30 | - beautifulsoup4=4.12.3=pyha770c72_1 31 | - blas=2.116=mkl 32 | - blas-devel=3.9.0=16_linux64_mkl 33 | - brotli-python=1.1.0=py312h2ec8cdc_2 34 | - bzip2=1.0.8=h4bc722e_7 35 | - c-ares=1.34.4=hb9d3cd8_0 36 | - ca-certificates=2024.12.14=hbcca054_0 37 | - 
certifi=2024.2.2=pyhd8ed1ab_0 38 | - cffi=1.17.1=py312h06ac9bb_0 39 | - chardet=5.2.0=py312h7900ff3_2 40 | - charset-normalizer=3.3.2=pyhd8ed1ab_0 41 | - click=8.1.7=unix_pyh707e725_1 42 | - colorama=0.4.6=pyhd8ed1ab_1 43 | - cryptography=44.0.0=py312hda17c39_0 44 | - cuda-cudart=12.4.127=0 45 | - cuda-cupti=12.4.127=0 46 | - cuda-libraries=12.4.1=0 47 | - cuda-nvrtc=12.4.127=0 48 | - cuda-nvtx=12.4.127=0 49 | - cuda-opencl=12.6.77=0 50 | - cuda-runtime=12.4.1=0 51 | - cuda-version=12.6=3 52 | - dataclasses-json=0.6.7=pyhd8ed1ab_1 53 | - datasets=3.2.0=pyhd8ed1ab_0 54 | - deepdiff=8.1.1=pyhd8ed1ab_0 55 | - dill=0.3.8=pyhd8ed1ab_0 56 | - emoji=2.14.0=pyhd8ed1ab_1 57 | - eval-type-backport=0.2.2=pyhd8ed1ab_0 58 | - eval_type_backport=0.2.2=pyha770c72_0 59 | - exceptiongroup=1.2.2=pyhd8ed1ab_1 60 | - ffmpeg=4.3=hf484d3e_0 61 | - filelock=3.16.1=pyhd8ed1ab_1 62 | - filetype=1.2.0=pyhd8ed1ab_0 63 | - freetype=2.12.1=h267a509_2 64 | - frozenlist=1.5.0=py312h66e93f0_0 65 | - fsspec=2024.9.0=pyhff2d567_0 66 | - gflags=2.2.2=h5888daf_1005 67 | - giflib=5.2.2=hd590300_0 68 | - glog=0.7.1=hbabe93e_0 69 | - gmp=6.3.0=hac33072_2 70 | - gmpy2=2.1.5=py312h7201bc8_3 71 | - gnutls=3.6.13=h85f3911_1 72 | - greenlet=3.1.1=py312h2ec8cdc_1 73 | - h11=0.14.0=pyhd8ed1ab_1 74 | - h2=4.1.0=pyhd8ed1ab_1 75 | - hpack=4.0.0=pyhd8ed1ab_1 76 | - html5lib=1.1=pyhd8ed1ab_2 77 | - httpcore=1.0.7=pyh29332c3_1 78 | - httpx=0.28.1=pyhd8ed1ab_0 79 | - huggingface_hub=0.27.1=pyhd8ed1ab_0 80 | - hyperframe=6.0.1=pyhd8ed1ab_1 81 | - icu=73.2=h59595ed_0 82 | - idna=3.6=pyhd8ed1ab_0 83 | - jinja2=3.1.5=pyhd8ed1ab_0 84 | - joblib=1.3.2=pyhd8ed1ab_0 85 | - jsonpatch=1.33=pyhd8ed1ab_1 86 | - jsonpath-python=1.0.6=pyhff2d567_2 87 | - jsonpointer=3.0.0=py312h7900ff3_1 88 | - keyutils=1.6.1=h166bdaf_0 89 | - krb5=1.21.3=h659f571_0 90 | - lame=3.100=h166bdaf_1003 91 | - langchain=0.3.14=pyhd8ed1ab_0 92 | - langchain-core=0.3.29=pyhd8ed1ab_0 93 | - langchain-text-splitters=0.3.5=pyhd8ed1ab_0 94 | - langdetect=1.0.9=pyhd8ed1ab_1 95 | - langsmith=0.2.10=pyhd8ed1ab_0 96 | - lcms2=2.16=hb7c19ff_0 97 | - ld_impl_linux-64=2.43=h712a8e2_2 98 | - lerc=4.0.0=h27087fc_0 99 | - libabseil=20240116.2=cxx17_he02047a_1 100 | - libarrow=16.1.0=hcb6531f_6_cpu 101 | - libarrow-acero=16.1.0=hac33072_6_cpu 102 | - libarrow-dataset=16.1.0=hac33072_6_cpu 103 | - libarrow-substrait=16.1.0=h7e0c224_6_cpu 104 | - libblas=3.9.0=16_linux64_mkl 105 | - libbrotlicommon=1.1.0=hb9d3cd8_2 106 | - libbrotlidec=1.1.0=hb9d3cd8_2 107 | - libbrotlienc=1.1.0=hb9d3cd8_2 108 | - libcblas=3.9.0=16_linux64_mkl 109 | - libcrc32c=1.1.2=h9c3ff4c_0 110 | - libcublas=12.4.5.8=0 111 | - libcufft=11.2.1.3=0 112 | - libcufile=1.11.1.6=0 113 | - libcurand=10.3.7.77=0 114 | - libcurl=8.8.0=hca28451_1 115 | - libcusolver=11.6.1.9=0 116 | - libcusparse=12.3.1.170=0 117 | - libdeflate=1.20=hd590300_0 118 | - libedit=3.1.20240808=pl5321h7949ede_0 119 | - libev=4.33=hd590300_2 120 | - libevent=2.1.12=hf998b51_1 121 | - libexpat=2.6.4=h5888daf_0 122 | - libffi=3.4.2=h7f98852_5 123 | - libgcc=14.2.0=h77fa898_1 124 | - libgcc-ng=14.2.0=h69a702a_1 125 | - libgfortran=14.2.0=h69a702a_1 126 | - libgfortran-ng=14.2.0=h69a702a_1 127 | - libgfortran5=14.2.0=hd5240d6_1 128 | - libgoogle-cloud=2.24.0=h2736e30_0 129 | - libgoogle-cloud-storage=2.24.0=h3d9a0c8_0 130 | - libgrpc=1.62.2=h15f2491_0 131 | - libhwloc=2.11.2=default_he43201b_1000 132 | - libiconv=1.17=hd590300_2 133 | - libjpeg-turbo=3.0.0=hd590300_1 134 | - liblapack=3.9.0=16_linux64_mkl 135 | - liblapacke=3.9.0=16_linux64_mkl 136 | - 
liblzma=5.6.3=hb9d3cd8_1 137 | - liblzma-devel=5.6.3=hb9d3cd8_1 138 | - libmagic=5.39=h753d276_1 139 | - libnghttp2=1.58.0=h47da74e_1 140 | - libnpp=12.2.5.30=0 141 | - libnsl=2.0.1=hd590300_0 142 | - libnvfatbin=12.6.77=0 143 | - libnvjitlink=12.4.127=0 144 | - libnvjpeg=12.3.1.117=0 145 | - libparquet=16.1.0=h6a7eafb_6_cpu 146 | - libpng=1.6.43=h2797004_0 147 | - libprotobuf=4.25.3=h08a7969_0 148 | - libre2-11=2023.09.01=h5a48ba9_2 149 | - libsqlite=3.46.0=hde9e2c9_0 150 | - libssh2=1.11.0=h0841786_0 151 | - libstdcxx=14.2.0=hc0a3c3a_1 152 | - libstdcxx-ng=14.2.0=h4852527_1 153 | - libthrift=0.19.0=hb90f79a_1 154 | - libtiff=4.6.0=h1dd3fc0_3 155 | - libutf8proc=2.8.0=hf23e847_1 156 | - libuuid=2.38.1=h0b41bf4_0 157 | - libwebp=1.4.0=h2c329e2_0 158 | - libwebp-base=1.4.0=hd590300_0 159 | - libxcb=1.15=h0b41bf4_0 160 | - libxcrypt=4.4.36=hd590300_1 161 | - libxml2=2.12.7=hc051c1a_1 162 | - libxslt=1.1.39=h76b75d6_0 163 | - libzlib=1.2.13=h4ab18f5_6 164 | - llvm-openmp=15.0.7=h0cdce71_0 165 | - lxml=5.2.2=py312hb90d8a5_0 166 | - lz4-c=1.9.4=hcb278e6_0 167 | - marshmallow=3.20.2=pyhd8ed1ab_0 168 | - mkl=2022.1.0=h84fe81f_915 169 | - mkl-devel=2022.1.0=ha770c72_916 170 | - mkl-include=2022.1.0=h84fe81f_915 171 | - mpc=1.3.1=h24ddda3_1 172 | - mpfr=4.2.1=h90cbb55_3 173 | - mpmath=1.3.0=pyhd8ed1ab_1 174 | - multidict=6.1.0=py312h178313f_2 175 | - multiprocess=0.70.16=py312h66e93f0_1 176 | - mypy_extensions=1.0.0=pyha770c72_1 177 | - ncurses=6.5=h2d0b736_2 178 | - ndjson=0.3.1=pyhd8ed1ab_0 179 | - nest-asyncio=1.6.0=pyhd8ed1ab_1 180 | - nettle=3.6=he412f7d_0 181 | - networkx=3.4.2=pyh267e887_2 182 | - nltk=3.9.1=pyhd8ed1ab_1 183 | - numpy=1.26.4=py312heda63a1_0 184 | - olefile=0.47=pyhd8ed1ab_1 185 | - openh264=2.1.1=h780b84a_0 186 | - openjpeg=2.5.2=h488ebb8_0 187 | - openssl=3.4.0=h7b32b05_1 188 | - orc=2.0.1=h17fec99_1 189 | - orderly-set=5.2.3=pyh29332c3_1 190 | - orjson=3.10.14=py312h12e396e_0 191 | - packaging=23.2=pyhd8ed1ab_0 192 | - pandas=2.2.3=py312hf9745cd_1 193 | - pillow=10.3.0=py312hdcec9eb_0 194 | - pip=24.3.1=pyh8b19718_2 195 | - propcache=0.2.1=py312h66e93f0_0 196 | - psutil=6.1.1=py312h66e93f0_0 197 | - pthread-stubs=0.4=hb9d3cd8_1002 198 | - pyarrow=16.1.0=py312h8da182e_1 199 | - pyarrow-core=16.1.0=py312h5429d62_1_cpu 200 | - pycparser=2.22=pyh29332c3_1 201 | - pydantic=2.9.2=pyhd8ed1ab_0 202 | - pydantic-core=2.23.4=py312h12e396e_0 203 | - pypdf=5.1.0=pyha770c72_1 204 | - pysocks=1.7.1=pyha55dd90_7 205 | - python=3.12.3=hab00c5b_0_cpython 206 | - python-dateutil=2.8.2=pyhd8ed1ab_0 207 | - python-iso639=2024.10.22=pyhff2d567_1 208 | - python-magic=0.4.27=pyh9ac5cc3_5 209 | - python-oxmsg=0.0.1=pyhff2d567_1 210 | - python-tzdata=2024.2=pyhd8ed1ab_1 211 | - python-xxhash=3.5.0=py312h66e93f0_1 212 | - python_abi=3.12=5_cp312 213 | - pytorch=2.5.1=py3.12_cuda12.4_cudnn9.1.0_0 214 | - pytorch-cuda=12.4=hc786d27_7 215 | - pytorch-mutex=1.0=cuda 216 | - pytz=2024.1=pyhd8ed1ab_0 217 | - pyyaml=6.0.2=py312h66e93f0_1 218 | - rapidfuzz=3.11.0=py312h2ec8cdc_0 219 | - re2=2023.09.01=h7f4b329_2 220 | - readline=8.2=h8228510_1 221 | - regex=2023.12.25=py312h98912ed_0 222 | - requests=2.32.3=pyhd8ed1ab_1 223 | - requests-toolbelt=1.0.0=pyhd8ed1ab_1 224 | - s2n=1.4.15=he19d79f_0 225 | - safetensors=0.5.2=py312h12e396e_0 226 | - scikit-learn=1.6.1=py312h7a48858_0 227 | - scipy=1.15.1=py312h180e4f1_0 228 | - sentence-transformers=3.3.1=pyhd8ed1ab_1 229 | - setuptools=75.8.0=pyhff2d567_0 230 | - six=1.16.0=pyhd8ed1ab_1 231 | - snappy=1.2.1=h8bd8927_1 232 | - sniffio=1.3.1=pyhd8ed1ab_1 233 | - 
soupsieve=2.5=pyhd8ed1ab_1 234 | - sqlalchemy=2.0.37=py312h66e93f0_0 235 | - tabulate=0.9.0=pyhd8ed1ab_2 236 | - tbb=2021.13.0=hceb3a55_1 237 | - tenacity=9.0.0=pyhd8ed1ab_1 238 | - threadpoolctl=3.5.0=pyhc1e730c_0 239 | - tk=8.6.13=noxft_h4845f30_101 240 | - tokenizers=0.21.0=py312h8360d73_0 241 | - torchaudio=2.5.1=py312_cu124 242 | - torchtriton=3.1.0=py312 243 | - torchvision=0.20.1=py312_cu124 244 | - tqdm=4.67.1=pyhd8ed1ab_1 245 | - transformers=4.48.0=pyhd8ed1ab_0 246 | - typing-extensions=4.12.2=hd8ed1ab_1 247 | - typing_extensions=4.12.2=pyha770c72_1 248 | - typing_inspect=0.9.0=pyhd8ed1ab_1 249 | - tzdata=2024b=hc8b5060_0 250 | - unstructured=0.16.12=pyhd8ed1ab_0 251 | - unstructured-client=0.28.1=pyhd8ed1ab_1 252 | - urllib3=1.26.18=pyhd8ed1ab_0 253 | - webencodings=0.5.1=pyhd8ed1ab_3 254 | - wheel=0.45.1=pyhd8ed1ab_1 255 | - wrapt=1.17.1=py312h66e93f0_0 256 | - xorg-libxau=1.0.12=hb9d3cd8_0 257 | - xorg-libxdmcp=1.1.5=hb9d3cd8_0 258 | - xxhash=0.8.2=hd590300_0 259 | - xz=5.6.3=hbcc6ac9_1 260 | - xz-gpl-tools=5.6.3=hbcc6ac9_1 261 | - xz-tools=5.6.3=hb9d3cd8_1 262 | - yaml=0.2.5=h7f98852_2 263 | - yarl=1.18.3=py312h66e93f0_0 264 | - zlib=1.2.13=h4ab18f5_6 265 | - zstandard=0.23.0=py312hef9b889_1 266 | - zstd=1.5.6=ha6fb4c9_0 267 | - pip: 268 | - accelerate==1.2.1 269 | - aiofiles==23.2.1 270 | - bitsandbytes==0.45.0 271 | - coloredlogs==15.0.1 272 | - fastapi==0.115.6 273 | - fastembed==0.5.0 274 | - ffmpy==0.5.0 275 | - flatbuffers==24.12.23 276 | - gradio==5.12.0 277 | - gradio-client==1.5.4 278 | - grpcio==1.69.0 279 | - grpcio-tools==1.69.0 280 | - httpx-sse==0.4.0 281 | - humanfriendly==10.0 282 | - langchain-community==0.3.14 283 | - loguru==0.7.3 284 | - markdown-it-py==3.0.0 285 | - markupsafe==2.1.5 286 | - mdurl==0.1.2 287 | - mmh3==4.1.0 288 | - onnx==1.17.0 289 | - onnxruntime==1.20.1 290 | - portalocker==2.10.1 291 | - protobuf==5.29.3 292 | - py-rust-stemmers==0.1.3 293 | - pydantic-settings==2.7.1 294 | - pydub==0.25.1 295 | - pygments==2.19.1 296 | - python-dotenv==1.0.1 297 | - python-multipart==0.0.20 298 | - qdrant-client==1.12.2 299 | - rich==13.9.4 300 | - ruff==0.9.1 301 | - safehttpx==0.1.6 302 | - semantic-version==2.10.0 303 | - shellingham==1.5.4 304 | - starlette==0.41.3 305 | - sympy==1.13.1 306 | - tomlkit==0.13.2 307 | - trl==0.13.0 308 | - typer==0.15.1 309 | - uvicorn==0.34.0 310 | - websockets==14.1 311 | --------------------------------------------------------------------------------