├── .gitignore
├── imgs
│   ├── qdurllm.png
│   ├── tutorial1.png
│   ├── tutorial2.png
│   └── tutorial3.png
├── app
│   ├── __pycache__
│   │   ├── rag.cpython-312.pyc
│   │   ├── loadUrls.cpython-312.pyc
│   │   └── texInference.cpython-312.pyc
│   ├── loadUrls.py
│   ├── texInference.py
│   ├── app.py
│   └── rag.py
├── compose.yaml
├── docs
│   ├── about.md
│   ├── contributing.md
│   └── index.md
├── mkdocs.yaml
├── LICENSE
├── CONTRIBUTING.md
├── README.md
└── environment.yml

/.gitignore:
--------------------------------------------------------------------------------
1 | */.cache
2 | application/flagged
3 | qdrant_storage/
--------------------------------------------------------------------------------
/imgs/qdurllm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AstraBert/qdurllm/HEAD/imgs/qdurllm.png
--------------------------------------------------------------------------------
/imgs/tutorial1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AstraBert/qdurllm/HEAD/imgs/tutorial1.png
--------------------------------------------------------------------------------
/imgs/tutorial2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AstraBert/qdurllm/HEAD/imgs/tutorial2.png
--------------------------------------------------------------------------------
/imgs/tutorial3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AstraBert/qdurllm/HEAD/imgs/tutorial3.png
--------------------------------------------------------------------------------
/app/__pycache__/rag.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AstraBert/qdurllm/HEAD/app/__pycache__/rag.cpython-312.pyc
--------------------------------------------------------------------------------
/app/__pycache__/loadUrls.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AstraBert/qdurllm/HEAD/app/__pycache__/loadUrls.cpython-312.pyc
--------------------------------------------------------------------------------
/app/__pycache__/texInference.cpython-312.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AstraBert/qdurllm/HEAD/app/__pycache__/texInference.cpython-312.pyc
--------------------------------------------------------------------------------
/compose.yaml:
--------------------------------------------------------------------------------
1 | networks:
2 |   mynet:
3 |     driver: bridge
4 | 
5 | services:
6 |   qdrant:
7 |     image: qdrant/qdrant
8 |     ports:
9 |       - "6333:6333"
10 |     volumes:
11 |       - "./qdrant_storage:/qdrant/storage"
12 |     networks:
13 |       - mynet
14 | 
--------------------------------------------------------------------------------
/docs/about.md:
--------------------------------------------------------------------------------
1 | # About `qdurllm`
2 | 
3 | **`qdurllm`** is an open-source, local Gradio application that leverages:
4 | 
5 | - semantic caching
6 | - sparse retrieval + reranking
7 | - LLM inference
8 | 
9 | to allow you to upload website pages to a Qdrant database, search them, and chat with them.
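
Under the hood, each chat message first checks a semantic cache of previous question/answer pairs, and only falls back to sparse retrieval, dense reranking and LLM generation on a cache miss. Here is a minimal sketch of that flow, using the project's own `rag.py` helpers — note that importing `rag` (re)creates the Qdrant collections, so this assumes the Qdrant container from `compose.yaml` is up on `localhost:6333` and that URLs have then been uploaded through the app's *Upload URLs* tab:

```python
# Minimal sketch of the cache-then-retrieve flow (see app.py for the full version)
from rag import client, dense_encoder, sparse_encoder, NeuralSearcher, SemanticCache

searcher = NeuralSearcher("memory", client, dense_encoder, sparse_encoder)
cache = SemanticCache(client, dense_encoder, "semantic_cache")

question = "What does this website say about vector search?"
answer = cache.search_cache(question)  # returns "" on a cache miss
if not answer:
    # sparse (SPLADE) retrieval over the uploaded chunks, then dense reranking
    context, url = searcher.search_text(question)
    # context is then passed to the LLM (texInference.pipe) and the resulting
    # question/answer pair is written back to the semantic cache
```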
10 | 
11 | > _Powered by [Qdrant](https://qdrant.tech), [FastEmbed](https://pypi.org/project/fastembed/), [SentenceTransformers](https://sbert.net/), [Nomic AI](https://www.nomic.ai/), [HuggingFace](https://huggingface.co), [Gradio](https://gradio.app), [Unstructured.io](https://unstructured.io/) and [LangChain](https://langchain.com)_
12 | 
13 | 
--------------------------------------------------------------------------------
/mkdocs.yaml:
--------------------------------------------------------------------------------
1 | site_name: qdurllm
2 | theme:
3 |   name: material
4 |   features:
5 |     - navigation.instant
6 |     - navigation.tracking
7 |     - navigation.tabs
8 |     - navigation.sections
9 |     - navigation.expand
10 |     - search.highlight
11 |     - search.share
12 |   palette:
13 |     scheme: default
14 |     primary: indigo
15 |     accent: indigo
16 | plugins:
17 |   - search
18 |   - awesome-pages
19 | markdown_extensions:
20 |   - pymdownx.highlight
21 |   - pymdownx.superfences
22 |   - pymdownx.inlinehilite
23 |   - pymdownx.snippets
24 |   - pymdownx.tasklist
25 |   - admonition
26 |   - toc
27 |   - attr_list
28 | nav:
29 |   - Home: index.md
30 |   - About: about.md
31 |   - Contributing: contributing.md
--------------------------------------------------------------------------------
/app/loadUrls.py:
--------------------------------------------------------------------------------
1 | from langchain_community.document_loaders.url import UnstructuredURLLoader
2 | from langchain.text_splitter import CharacterTextSplitter
3 | from rag import upload_text_to_qdrant, client
4 | from typing import List, Dict, Union
5 | 
6 | def urlload(urls: str) -> Union[List[Dict[str, str]], str]:
7 |     links = urls.split(",")
8 |     try:
9 |         loader = UnstructuredURLLoader(
10 |             urls=links, mode="elements",
11 |             strategy="fast"
12 |         )
13 |         docs = loader.load()
14 |         text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
15 |         pages = text_splitter.split_documents(docs)
16 |         contents = [{"text": page.page_content, "url": page.metadata["source"]} for page in pages]
17 |         return contents
18 |     except Exception as e:
19 |         # the caller (app.py) tells success from failure by the return type
20 |         return f"An error occurred while parsing the URLs: {e}"
21 | 
22 | 
23 | def to_db(contents: List[Dict[str, str]]) -> None:
24 |     # NOTE: point IDs restart from 0 on every call, so uploading a second batch
25 |     # of URLs overwrites the points stored by the previous one
26 |     c = 0
27 |     for content in contents:
28 |         upload_text_to_qdrant(client, "memory", content, c)
29 |         c += 1
30 |     return
31 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2025 Clelia (Astra) Bertelli
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/app/texInference.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | warnings.filterwarnings('ignore')
3 | import torch
4 | from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
5 | from trl import setup_chat_format
6 | from typing import Dict, List
7 | 
8 | model_name = 'Qwen/Qwen2.5-1.5B-Instruct'
9 | quantization_config = BitsAndBytesConfig(load_in_4bit = True, bnb_4bit_compute_dtype = torch.bfloat16, bnb_4bit_use_double_quant = True, bnb_4bit_quant_type = 'nf4')
10 | device = 'cuda' if torch.cuda.is_available() else 'cpu'
11 | # 4-bit bitsandbytes quantization requires a CUDA device; from_pretrained places the
12 | # model on the GPU itself (calling .to() on a 4-bit quantized model raises an error)
13 | quantized_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype = torch.bfloat16, quantization_config = quantization_config)
14 | tokenizer = AutoTokenizer.from_pretrained(model_name)
15 | # drop the built-in chat template so that setup_chat_format can install the ChatML one
16 | tokenizer.chat_template = None
17 | (quantized_model, tokenizer) = setup_chat_format(model = quantized_model, tokenizer = tokenizer)
18 | 
19 | def pipe(prompt: List[Dict[str, str]], temperature: float, top_p: float, max_new_tokens: int, repetition_penalty: float) -> str:
20 |     tokenized_chat = tokenizer.apply_chat_template(prompt, tokenize = True, add_generation_prompt = True, return_tensors = 'pt').to(device)
21 |     # do_sample=True is needed for temperature/top_p to take effect; generate()
22 |     # already returns the output tensor on the model's device
23 |     outputs = quantized_model.generate(tokenized_chat, max_new_tokens = max_new_tokens, do_sample = True, temperature = temperature, top_p = top_p, repetition_penalty = repetition_penalty)
24 |     results = tokenizer.decode(outputs[0])
25 |     return results
26 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to `qdurllm`
2 | 
3 | Do you want to contribute to this project? Make sure to read these guidelines first :)
4 | 
5 | ## Issue
6 | 
7 | **When to do it**:
8 | 
9 | - You found bugs but you don't know how to solve them, or you don't have the time/will to fix them
10 | - You want new features but you don't know how to implement them, or you don't have the time/will to implement them
11 | 
12 | > ⚠️ _Always check open and closed issues before you submit yours to avoid duplicates_
13 | 
14 | **How to do it**:
15 | 
16 | - Open an issue
17 | - Give the issue a meaningful title (short but effective problem description)
18 | - Describe the problem following the issue template
19 | 
20 | ## Traditional contribution
21 | 
22 | **When to do it**:
23 | 
24 | - You found bugs and corrected them
25 | - You optimized/improved the code
26 | - You added new features that you think could be useful to others
27 | 
28 | **How to do it**:
29 | 
30 | 1. Fork this repository
31 | 2. Commit your changes
32 | 3. Submit a pull request (make sure to provide a thorough description of the changes)
33 | 
34 | 
35 | ## Showcase your qdurllm
36 | 
37 | **When to do it**:
38 | 
39 | - You modified the base application with new features, but you don't want to (or can't) merge them with the original qdurllm
40 | 
41 | **How to do it**:
42 | 
43 | - Go to the [_GitHub Discussions > Show and tell_](https://github.com/AstraBert/qdurllm/discussions) page
44 | - Open a new discussion there, describing your qdurllm application
45 | 
46 | ### Thanks for contributing!
--------------------------------------------------------------------------------
/docs/contributing.md:
--------------------------------------------------------------------------------
1 | # Contributing to `qdurllm`
2 | 
3 | Do you want to contribute to this project? Make sure to read these guidelines first :)
4 | 
5 | ## Issue
6 | 
7 | **When to do it**:
8 | 
9 | - You found bugs but you don't know how to solve them, or you don't have the time/will to fix them
10 | - You want new features but you don't know how to implement them, or you don't have the time/will to implement them
11 | 
12 | > ⚠️ _Always check open and closed issues before you submit yours to avoid duplicates_
13 | 
14 | **How to do it**:
15 | 
16 | - Open an issue
17 | - Give the issue a meaningful title (short but effective problem description)
18 | - Describe the problem following the issue template
19 | 
20 | ## Traditional contribution
21 | 
22 | **When to do it**:
23 | 
24 | - You found bugs and corrected them
25 | - You optimized/improved the code
26 | - You added new features that you think could be useful to others
27 | 
28 | **How to do it**:
29 | 
30 | 1. Fork this repository
31 | 2. Commit your changes
32 | 3. Submit a pull request (make sure to provide a thorough description of the changes)
33 | 
34 | 
35 | ## Showcase your qdurllm
36 | 
37 | **When to do it**:
38 | 
39 | - You modified the base application with new features, but you don't want to (or can't) merge them with the original qdurllm
40 | 
41 | **How to do it**:
42 | 
43 | - Go to the [_GitHub Discussions > Show and tell_](https://github.com/AstraBert/qdurllm/discussions) page
44 | - Open a new discussion there, describing your qdurllm application
45 | 
46 | ### Thanks for contributing!
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # qdurllm
2 | _Search your favorite websites and chat with them, on your desktop🌐_
3 | 
4 | # Docs in active development!👷‍♀️
5 | 
6 | They will soon be available at: https://astrabert.github.io/qdurllm/
7 | 
8 | In the meantime, refer to the **Quickstart guide** in this README!
9 | 
10 | ## Quickstart
11 | 
12 | ### 1. Prerequisites
13 | 
14 | - [`conda`](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) package manager
15 | - [`docker`](https://www.docker.com/) and [`docker compose`](https://docs.docker.com/compose/)
16 | 
17 | ### 2. Installation
18 | 
19 | > [!IMPORTANT]
20 | > _This is only for the pre-release of `v1.0.0`, i.e. `v1.0.0-rc.0`_
21 | 
22 | 1. Clone the `january-2025` branch of this GitHub repo:
23 | 
24 | ```bash
25 | git clone -b january-2025 --single-branch https://github.com/AstraBert/qdurllm.git
26 | cd qdurllm/
27 | ```
28 | 
29 | 2. Create the `conda` environment:
30 | 
31 | ```bash
32 | conda env create -f environment.yml
33 | ```
34 | 
35 | 3. Pull `qdrant` from Docker Hub:
36 | 
37 | ```bash
38 | docker pull qdrant/qdrant
39 | ```
40 | 
41 | ### 3. Launching
42 | 
43 | 1. Launch the `qdrant` vector database service with `docker compose` (from within the `qdurllm` folder):
44 | 
45 | ```bash
46 | docker compose up
47 | ```
48 | 
49 | 2. Activate the `qdurllm` conda environment you just created:
50 | 
51 | ```bash
52 | conda activate qdurllm
53 | ```
54 | 
55 | 3. Go inside the `app` directory and launch the Gradio application:
56 | 
57 | ```bash
58 | cd app/
59 | python3 app.py
60 | ```
61 | 
62 | You should see the app running on `http://localhost:7860` once all the models are downloaded from HuggingFace Hub.
63 | 
64 | ## Relies on
65 | 
66 | - [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct), with Apache 2.0 license
67 | - [nomic-ai/modernbert-embed-base](https://huggingface.co/nomic-ai/modernbert-embed-base), with Apache 2.0 license
68 | - [prithivida/Splade_PP_en_v1](https://huggingface.co/prithivida/Splade_PP_en_v1), with Apache 2.0 license
69 | 
70 | 
71 | ## Give feedback!
72 | 
73 | Comment on the [**discussion thread created for this release**](https://github.com/AstraBert/qdurllm/discussions) with your feedback or create [**issues**](https://github.com/AstraBert/qdurllm/issues) :)
74 | 
75 | 
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # qdurllm
2 | _Search your favorite websites and chat with them, on your desktop🌐_
3 | 
4 | # Docs in active development!👷‍♀️
5 | 
6 | They will soon be available at: https://astrabert.github.io/qdurllm/
7 | 
8 | In the meantime, refer to the **Quickstart guide** on this page!
9 | 
10 | ## Quickstart
11 | 
12 | ### 1. Prerequisites
13 | 
14 | - [`conda`](https://docs.conda.io/projects/conda/en/latest/user-guide/getting-started.html) package manager
15 | - [`docker`](https://www.docker.com/) and [`docker compose`](https://docs.docker.com/compose/)
16 | 
17 | ### 2. Installation
18 | 
19 | > [!IMPORTANT]
20 | > _This is only for the pre-release of `v1.0.0`, i.e. `v1.0.0-rc.0`_
21 | 
22 | 1. Clone the `january-2025` branch of this GitHub repo:
23 | 
24 | ```bash
25 | git clone -b january-2025 --single-branch https://github.com/AstraBert/qdurllm.git
26 | cd qdurllm/
27 | ```
28 | 
29 | 2. Create the `conda` environment:
30 | 
31 | ```bash
32 | conda env create -f environment.yml
33 | ```
34 | 
35 | 3. Pull `qdrant` from Docker Hub:
36 | 
37 | ```bash
38 | docker pull qdrant/qdrant
39 | ```
40 | 
41 | ### 3. Launching
42 | 
43 | 1. Launch the `qdrant` vector database service with `docker compose` (from within the `qdurllm` folder):
44 | 
45 | ```bash
46 | docker compose up
47 | ```
48 | 
49 | 2. Activate the `qdurllm` conda environment you just created:
50 | 
51 | ```bash
52 | conda activate qdurllm
53 | ```
54 | 
55 | 3. Go inside the `app` directory and launch the Gradio application:
56 | 
57 | ```bash
58 | cd app/
59 | python3 app.py
60 | ```
61 | 
62 | You should see the app running on `http://localhost:7860` once all the models are downloaded from HuggingFace Hub.
63 | 
64 | ## Relies on
65 | 
66 | - [Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct), with Apache 2.0 license
67 | - [nomic-ai/modernbert-embed-base](https://huggingface.co/nomic-ai/modernbert-embed-base), with Apache 2.0 license
68 | - [prithivida/Splade_PP_en_v1](https://huggingface.co/prithivida/Splade_PP_en_v1), with Apache 2.0 license
69 | 
70 | 
71 | ## Give feedback!
72 | 
73 | Comment on the [**discussion thread created for this release**](https://github.com/AstraBert/qdurllm/discussions) with your feedback or create [**issues**](https://github.com/AstraBert/qdurllm/issues) :)
74 | 
75 | 
--------------------------------------------------------------------------------
/app/app.py:
--------------------------------------------------------------------------------
1 | from rag import client, SemanticCache, NeuralSearcher, dense_encoder, sparse_encoder
2 | from texInference import pipe
3 | from loadUrls import urlload, to_db
4 | import gradio as gr
5 | import time
6 | 
7 | 
8 | searcher = NeuralSearcher("memory", client, dense_encoder, sparse_encoder)
9 | semantic_cache = SemanticCache(client, dense_encoder, "semantic_cache")
10 | 
11 | 
12 | def upload2qdrant(url):
13 |     global client
14 |     documents = urlload(url)
15 |     if isinstance(documents, list):  # urlload returns an error string on failure
16 |         try:
17 |             to_db(documents)
18 |             return "URLs successfully uploaded to Qdrant collection!"
19 |         except Exception as e:
20 |             return f"An error occurred: {e}"
21 |     else:
22 |         return documents
23 | 
24 | demo0 = gr.Interface(fn=upload2qdrant, title="Upload URL content to Qdrant", inputs=gr.Textbox(label="URL(s)", info="Add one URL or more (if more, you should provide them comma-separated, like this: URL1,URL2,...,URLn)"), outputs=gr.Textbox(label="Logs"))
25 | 
26 | 
27 | def reply(message, history, ntokens, rep_pen, temp, topp, systemins):
28 |     sr = semantic_cache.search_cache(message)
29 |     if sr:  # semantic-cache hit: stream the cached answer back character by character
30 |         response = sr
31 |         this_hist = ''
32 |         for c in response:
33 |             this_hist += c
34 |             time.sleep(0.001)
35 |             yield this_hist
36 |     else:  # cache miss: retrieve context, generate with the LLM and cache the result
37 |         context, url = searcher.search_text(message)
38 |         prompt = [{"role": "system", "content": systemins}, {"role": "user", "content": f"This is the context information to reply to my prompt:\n\n{context}"}, {"role": "user", "content": message}]
39 |         results = pipe(prompt, temp, topp, ntokens, rep_pen)
40 |         results = results.split("<|im_start|>assistant\n")[1]  # keep only the generated (ChatML) assistant turn
41 |         response = results.replace("<|im_end|>", "")
42 |         semantic_cache.upload_to_cache(message, response)
43 |         this_hist = ''
44 |         for c in response:
45 |             this_hist += c
46 |             time.sleep(0.001)
47 |             yield this_hist
48 | 
49 | def direct_search(input_text):
50 |     context, url = searcher.search_text(input_text)
51 |     return context, f"Reference website [here]({url})"
52 | 
53 | demo2 = gr.Interface(fn=direct_search, inputs=gr.Textbox(label="Search Query", placeholder="Input your search query here..."), outputs=[gr.Textbox(label="Retrieved Content"), gr.Markdown(label="URL")], title="Search your URLs")
54 | 
55 | user_max_new_tokens = gr.Slider(0, 4096, value=512, label="Max new tokens", info="Select max output tokens (higher number of tokens will result in a longer latency)")
56 | user_max_temperature = gr.Slider(0, 1, value=0.1, step=0.1, label="Temperature", info="Select generation temperature")
57 | user_max_rep_pen = gr.Slider(0, 10, value=1.2, step=0.1, label="Repetition penalty", info="Select repetition penalty")
58 | user_top_p = gr.Slider(0.1, 1, value=1, step=0.1, label="top_p", info="Select top_p for the generation")
59 | system_ins = gr.Textbox(label="System Prompt", info="Insert your system prompt here", value="You are a helpful web searching assistant. You reply based on the contextual information you are provided with and on your knowledge.")
60 | additional_accordion = gr.Accordion(label="Parameters to be set before you start chatting", open=True)
61 | demo1 = gr.ChatInterface(fn=reply, title="Chat with your URLs", additional_inputs=[user_max_new_tokens, user_max_temperature, user_max_rep_pen, user_top_p, system_ins], additional_inputs_accordion=additional_accordion)
62 | 
63 | my_theme = gr.themes.Soft(primary_hue=gr.themes.colors.rose, secondary_hue=gr.themes.colors.pink)
64 | 
65 | demo = gr.TabbedInterface([demo0, demo1, demo2], ["Upload URLs", "Chat with URLs", "Direct Search"], theme=my_theme)
66 | 
67 | if __name__ == "__main__":
68 |     demo.launch(server_name="0.0.0.0", server_port=7860)
--------------------------------------------------------------------------------
/app/rag.py:
--------------------------------------------------------------------------------
1 | from qdrant_client import QdrantClient, models
2 | from fastembed import SparseTextEmbedding
3 | from sentence_transformers import SentenceTransformer
4 | import torch
5 | import uuid
6 | from typing import List, Dict
7 | 
8 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
9 | 
10 | dense_encoder = SentenceTransformer('nomic-ai/modernbert-embed-base').to(device)
11 | sparse_encoder = SparseTextEmbedding(model_name = 'prithivida/Splade_PP_en_v1')
12 | 
13 | client = QdrantClient('http://localhost:6333')
14 | client.recreate_collection(  # NOTE: collections are dropped and re-created at import time, so data does not persist across restarts
15 |     collection_name = 'memory',
16 |     vectors_config = {},
17 |     sparse_vectors_config = {
18 |         'sparse-text': models.SparseVectorParams(index = models.SparseIndexParams(on_disk = False))
19 |     })
20 | client.recreate_collection(
21 |     collection_name = 'semantic_cache',
22 |     vectors_config = models.VectorParams(size = 768, distance = models.Distance.COSINE)  # 768 = modernbert-embed-base embedding size
23 | )
24 | 
25 | def reranking(docs: List[str], query: str, dense_encoder: SentenceTransformer):
26 |     query = "search_query: " + query  # prefixes expected by modernbert-embed
27 |     docs = ["search_document: " + doc for doc in docs]
28 |     query_vector = dense_encoder.encode(query)
29 |     docs_vector = dense_encoder.encode(docs)
30 |     similarities = dense_encoder.similarity(docs_vector, query_vector)
31 |     sims = [float(sim[0]) for sim in similarities]
32 |     text2sims = {docs[i]: sims[i] for i in range(len(sims))}
33 |     sorted_items = sorted(text2sims.items(), key=lambda x: x[1], reverse=True)
34 |     return sorted_items[0][0]  # return only the best-scoring document
35 | 
36 | 
37 | def get_sparse_embedding(text: str, model: SparseTextEmbedding):
38 |     embeddings = list(model.embed(text))
39 |     vector = {"sparse-text": models.SparseVector(indices=embeddings[0].indices, values=embeddings[0].values)}
40 |     return vector
41 | 
42 | 
43 | def get_query_sparse_embedding(text: str, model: SparseTextEmbedding):
44 |     embeddings = list(model.embed(text))
45 |     query_vector = models.NamedSparseVector(
46 |         name="sparse-text",
47 |         vector=models.SparseVector(
48 |             indices=embeddings[0].indices,
49 |             values=embeddings[0].values,
50 |         ),
51 |     )
52 |     return query_vector
53 | 
54 | 
55 | def upload_text_to_qdrant(client: QdrantClient, collection_name: str, docs: dict, point_id_sparse: int):
56 |     try:
57 |         client.upsert(
58 |             collection_name=collection_name,
59 |             points=[
60 |                 models.PointStruct(
61 |                     id=point_id_sparse,
62 |                     vector=get_sparse_embedding(docs["text"], sparse_encoder),
63 |                     payload=docs,
64 |                 )
65 |             ],
66 |         )
67 |         return True
68 |     except Exception:
69 |         return False
70 | 
71 | 
72 | class SemanticCache:
73 |     def __init__(self, client: QdrantClient, text_encoder: SentenceTransformer, collection_name: str, threshold: float = 0.75):  # threshold = minimum cosine similarity for a cache hit
74 |         self.client = client
75 |         self.text_encoder = text_encoder
76 |         self.collection_name = collection_name
77 |         self.threshold = threshold
78 |     def upload_to_cache(self, question: str, answer: str):
79 |         docs = {"question": question, "answer": answer}
80 |         tct = 'search_document: ' + question
81 |         point_id = str(uuid.uuid4())
82 |         self.client.upsert(
83 |             collection_name=self.collection_name,
84 |             points=[
85 |                 models.PointStruct(
86 |                     id=point_id,
87 |                     vector=self.text_encoder.encode(tct).tolist(),
88 |                     payload=docs,
89 |                 )
90 |             ],
91 |         )
92 |     def search_cache(self, question: str, limit: int = 5):
93 |         question = 'search_query: ' + question
94 |         vector = self.text_encoder.encode(question).tolist()
95 |         search_result = self.client.search(
96 |             collection_name=self.collection_name,
97 |             query_vector=vector,
98 |             query_filter=None,
99 |             limit=limit,
100 |         )
101 |         payloads = [hit.payload["answer"] for hit in search_result if hit.score > self.threshold]
102 |         if len(payloads) > 0:
103 |             return payloads[0]
104 |         else:
105 |             return ""
106 | 
107 | 
108 | 
109 | class NeuralSearcher:
110 |     def __init__(self, text_collection_name: str, client: QdrantClient, dense_encoder: SentenceTransformer, sparse_encoder: SparseTextEmbedding):
111 |         self.text_collection_name = text_collection_name
112 |         self.dense_encoder = dense_encoder
113 |         self.qdrant_client = client
114 |         self.sparse_encoder = sparse_encoder
115 |     def search_text(self, text: str, limit: int = 5):
116 |         search_result_sparse = self.qdrant_client.search(
117 |             collection_name=self.text_collection_name,
118 |             query_vector=get_query_sparse_embedding(text, self.sparse_encoder),
119 |             query_filter=None,
120 |             limit=limit,
121 |         )
122 |         payloads = [hit.payload["text"] for hit in search_result_sparse]
123 |         urls = [hit.payload["url"] for hit in search_result_sparse]
124 |         txt2url = {payloads[i]: urls[i] for i in range(len(urls))}
125 |         context = reranking(payloads, text, self.dense_encoder)
126 |         context = context.replace("search_document: ", "")
127 |         return context, txt2url[context]
128 | 
129 | 
130 | 
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: qdurllm 2 | channels: 3 | - pytorch 4 | - nvidia 5 | - conda-forge 6 | dependencies: 7 | - _libgcc_mutex=0.1=conda_forge 8 | - _openmp_mutex=4.5=2_kmp_llvm 9 | - aiohappyeyeballs=2.4.4=pyhd8ed1ab_1 10 | - aiohttp=3.11.11=py312h178313f_0 11 | - aiosignal=1.3.2=pyhd8ed1ab_0 12 | - annotated-types=0.7.0=pyhd8ed1ab_1 13 | - anyio=4.8.0=pyhd8ed1ab_0 14 | - async-timeout=4.0.3=pyhd8ed1ab_0 15 | - attrs=24.3.0=pyh71513ae_0 16 | - aws-c-auth=0.7.22=h96bc93b_2 17 | - aws-c-cal=0.6.14=h88a6e22_1 18 | - aws-c-common=0.9.19=h4ab18f5_0 19 | - aws-c-compression=0.2.18=h83b837d_6 20 | - aws-c-event-stream=0.4.2=ha47c788_12 21 | - aws-c-http=0.8.1=h29d6fba_17 22 | - aws-c-io=0.14.8=h21d4f22_5 23 | - aws-c-mqtt=0.10.4=h759edc4_4 24 | - aws-c-s3=0.5.9=h594631b_3 25 | - aws-c-sdkutils=0.1.16=h83b837d_2 26 | - aws-checksums=0.1.18=h83b837d_6 27 | - aws-crt-cpp=0.26.9=he3a8b3b_0 28 | - aws-sdk-cpp=1.11.329=hba8bd5f_3 29 | - backoff=2.2.1=pyhd8ed1ab_1 30 | - beautifulsoup4=4.12.3=pyha770c72_1 31 | - blas=2.116=mkl 32 | - blas-devel=3.9.0=16_linux64_mkl 33 | - brotli-python=1.1.0=py312h2ec8cdc_2 34 | - bzip2=1.0.8=h4bc722e_7 35 | - c-ares=1.34.4=hb9d3cd8_0 36 | - ca-certificates=2024.12.14=hbcca054_0 37 | - 
certifi=2024.2.2=pyhd8ed1ab_0 38 | - cffi=1.17.1=py312h06ac9bb_0 39 | - chardet=5.2.0=py312h7900ff3_2 40 | - charset-normalizer=3.3.2=pyhd8ed1ab_0 41 | - click=8.1.7=unix_pyh707e725_1 42 | - colorama=0.4.6=pyhd8ed1ab_1 43 | - cryptography=44.0.0=py312hda17c39_0 44 | - cuda-cudart=12.4.127=0 45 | - cuda-cupti=12.4.127=0 46 | - cuda-libraries=12.4.1=0 47 | - cuda-nvrtc=12.4.127=0 48 | - cuda-nvtx=12.4.127=0 49 | - cuda-opencl=12.6.77=0 50 | - cuda-runtime=12.4.1=0 51 | - cuda-version=12.6=3 52 | - dataclasses-json=0.6.7=pyhd8ed1ab_1 53 | - datasets=3.2.0=pyhd8ed1ab_0 54 | - deepdiff=8.1.1=pyhd8ed1ab_0 55 | - dill=0.3.8=pyhd8ed1ab_0 56 | - emoji=2.14.0=pyhd8ed1ab_1 57 | - eval-type-backport=0.2.2=pyhd8ed1ab_0 58 | - eval_type_backport=0.2.2=pyha770c72_0 59 | - exceptiongroup=1.2.2=pyhd8ed1ab_1 60 | - ffmpeg=4.3=hf484d3e_0 61 | - filelock=3.16.1=pyhd8ed1ab_1 62 | - filetype=1.2.0=pyhd8ed1ab_0 63 | - freetype=2.12.1=h267a509_2 64 | - frozenlist=1.5.0=py312h66e93f0_0 65 | - fsspec=2024.9.0=pyhff2d567_0 66 | - gflags=2.2.2=h5888daf_1005 67 | - giflib=5.2.2=hd590300_0 68 | - glog=0.7.1=hbabe93e_0 69 | - gmp=6.3.0=hac33072_2 70 | - gmpy2=2.1.5=py312h7201bc8_3 71 | - gnutls=3.6.13=h85f3911_1 72 | - greenlet=3.1.1=py312h2ec8cdc_1 73 | - h11=0.14.0=pyhd8ed1ab_1 74 | - h2=4.1.0=pyhd8ed1ab_1 75 | - hpack=4.0.0=pyhd8ed1ab_1 76 | - html5lib=1.1=pyhd8ed1ab_2 77 | - httpcore=1.0.7=pyh29332c3_1 78 | - httpx=0.28.1=pyhd8ed1ab_0 79 | - huggingface_hub=0.27.1=pyhd8ed1ab_0 80 | - hyperframe=6.0.1=pyhd8ed1ab_1 81 | - icu=73.2=h59595ed_0 82 | - idna=3.6=pyhd8ed1ab_0 83 | - jinja2=3.1.5=pyhd8ed1ab_0 84 | - joblib=1.3.2=pyhd8ed1ab_0 85 | - jsonpatch=1.33=pyhd8ed1ab_1 86 | - jsonpath-python=1.0.6=pyhff2d567_2 87 | - jsonpointer=3.0.0=py312h7900ff3_1 88 | - keyutils=1.6.1=h166bdaf_0 89 | - krb5=1.21.3=h659f571_0 90 | - lame=3.100=h166bdaf_1003 91 | - langchain=0.3.14=pyhd8ed1ab_0 92 | - langchain-core=0.3.29=pyhd8ed1ab_0 93 | - langchain-text-splitters=0.3.5=pyhd8ed1ab_0 94 | - langdetect=1.0.9=pyhd8ed1ab_1 95 | - langsmith=0.2.10=pyhd8ed1ab_0 96 | - lcms2=2.16=hb7c19ff_0 97 | - ld_impl_linux-64=2.43=h712a8e2_2 98 | - lerc=4.0.0=h27087fc_0 99 | - libabseil=20240116.2=cxx17_he02047a_1 100 | - libarrow=16.1.0=hcb6531f_6_cpu 101 | - libarrow-acero=16.1.0=hac33072_6_cpu 102 | - libarrow-dataset=16.1.0=hac33072_6_cpu 103 | - libarrow-substrait=16.1.0=h7e0c224_6_cpu 104 | - libblas=3.9.0=16_linux64_mkl 105 | - libbrotlicommon=1.1.0=hb9d3cd8_2 106 | - libbrotlidec=1.1.0=hb9d3cd8_2 107 | - libbrotlienc=1.1.0=hb9d3cd8_2 108 | - libcblas=3.9.0=16_linux64_mkl 109 | - libcrc32c=1.1.2=h9c3ff4c_0 110 | - libcublas=12.4.5.8=0 111 | - libcufft=11.2.1.3=0 112 | - libcufile=1.11.1.6=0 113 | - libcurand=10.3.7.77=0 114 | - libcurl=8.8.0=hca28451_1 115 | - libcusolver=11.6.1.9=0 116 | - libcusparse=12.3.1.170=0 117 | - libdeflate=1.20=hd590300_0 118 | - libedit=3.1.20240808=pl5321h7949ede_0 119 | - libev=4.33=hd590300_2 120 | - libevent=2.1.12=hf998b51_1 121 | - libexpat=2.6.4=h5888daf_0 122 | - libffi=3.4.2=h7f98852_5 123 | - libgcc=14.2.0=h77fa898_1 124 | - libgcc-ng=14.2.0=h69a702a_1 125 | - libgfortran=14.2.0=h69a702a_1 126 | - libgfortran-ng=14.2.0=h69a702a_1 127 | - libgfortran5=14.2.0=hd5240d6_1 128 | - libgoogle-cloud=2.24.0=h2736e30_0 129 | - libgoogle-cloud-storage=2.24.0=h3d9a0c8_0 130 | - libgrpc=1.62.2=h15f2491_0 131 | - libhwloc=2.11.2=default_he43201b_1000 132 | - libiconv=1.17=hd590300_2 133 | - libjpeg-turbo=3.0.0=hd590300_1 134 | - liblapack=3.9.0=16_linux64_mkl 135 | - liblapacke=3.9.0=16_linux64_mkl 136 | - 
liblzma=5.6.3=hb9d3cd8_1 137 | - liblzma-devel=5.6.3=hb9d3cd8_1 138 | - libmagic=5.39=h753d276_1 139 | - libnghttp2=1.58.0=h47da74e_1 140 | - libnpp=12.2.5.30=0 141 | - libnsl=2.0.1=hd590300_0 142 | - libnvfatbin=12.6.77=0 143 | - libnvjitlink=12.4.127=0 144 | - libnvjpeg=12.3.1.117=0 145 | - libparquet=16.1.0=h6a7eafb_6_cpu 146 | - libpng=1.6.43=h2797004_0 147 | - libprotobuf=4.25.3=h08a7969_0 148 | - libre2-11=2023.09.01=h5a48ba9_2 149 | - libsqlite=3.46.0=hde9e2c9_0 150 | - libssh2=1.11.0=h0841786_0 151 | - libstdcxx=14.2.0=hc0a3c3a_1 152 | - libstdcxx-ng=14.2.0=h4852527_1 153 | - libthrift=0.19.0=hb90f79a_1 154 | - libtiff=4.6.0=h1dd3fc0_3 155 | - libutf8proc=2.8.0=hf23e847_1 156 | - libuuid=2.38.1=h0b41bf4_0 157 | - libwebp=1.4.0=h2c329e2_0 158 | - libwebp-base=1.4.0=hd590300_0 159 | - libxcb=1.15=h0b41bf4_0 160 | - libxcrypt=4.4.36=hd590300_1 161 | - libxml2=2.12.7=hc051c1a_1 162 | - libxslt=1.1.39=h76b75d6_0 163 | - libzlib=1.2.13=h4ab18f5_6 164 | - llvm-openmp=15.0.7=h0cdce71_0 165 | - lxml=5.2.2=py312hb90d8a5_0 166 | - lz4-c=1.9.4=hcb278e6_0 167 | - marshmallow=3.20.2=pyhd8ed1ab_0 168 | - mkl=2022.1.0=h84fe81f_915 169 | - mkl-devel=2022.1.0=ha770c72_916 170 | - mkl-include=2022.1.0=h84fe81f_915 171 | - mpc=1.3.1=h24ddda3_1 172 | - mpfr=4.2.1=h90cbb55_3 173 | - mpmath=1.3.0=pyhd8ed1ab_1 174 | - multidict=6.1.0=py312h178313f_2 175 | - multiprocess=0.70.16=py312h66e93f0_1 176 | - mypy_extensions=1.0.0=pyha770c72_1 177 | - ncurses=6.5=h2d0b736_2 178 | - ndjson=0.3.1=pyhd8ed1ab_0 179 | - nest-asyncio=1.6.0=pyhd8ed1ab_1 180 | - nettle=3.6=he412f7d_0 181 | - networkx=3.4.2=pyh267e887_2 182 | - nltk=3.9.1=pyhd8ed1ab_1 183 | - numpy=1.26.4=py312heda63a1_0 184 | - olefile=0.47=pyhd8ed1ab_1 185 | - openh264=2.1.1=h780b84a_0 186 | - openjpeg=2.5.2=h488ebb8_0 187 | - openssl=3.4.0=h7b32b05_1 188 | - orc=2.0.1=h17fec99_1 189 | - orderly-set=5.2.3=pyh29332c3_1 190 | - orjson=3.10.14=py312h12e396e_0 191 | - packaging=23.2=pyhd8ed1ab_0 192 | - pandas=2.2.3=py312hf9745cd_1 193 | - pillow=10.3.0=py312hdcec9eb_0 194 | - pip=24.3.1=pyh8b19718_2 195 | - propcache=0.2.1=py312h66e93f0_0 196 | - psutil=6.1.1=py312h66e93f0_0 197 | - pthread-stubs=0.4=hb9d3cd8_1002 198 | - pyarrow=16.1.0=py312h8da182e_1 199 | - pyarrow-core=16.1.0=py312h5429d62_1_cpu 200 | - pycparser=2.22=pyh29332c3_1 201 | - pydantic=2.9.2=pyhd8ed1ab_0 202 | - pydantic-core=2.23.4=py312h12e396e_0 203 | - pypdf=5.1.0=pyha770c72_1 204 | - pysocks=1.7.1=pyha55dd90_7 205 | - python=3.12.3=hab00c5b_0_cpython 206 | - python-dateutil=2.8.2=pyhd8ed1ab_0 207 | - python-iso639=2024.10.22=pyhff2d567_1 208 | - python-magic=0.4.27=pyh9ac5cc3_5 209 | - python-oxmsg=0.0.1=pyhff2d567_1 210 | - python-tzdata=2024.2=pyhd8ed1ab_1 211 | - python-xxhash=3.5.0=py312h66e93f0_1 212 | - python_abi=3.12=5_cp312 213 | - pytorch=2.5.1=py3.12_cuda12.4_cudnn9.1.0_0 214 | - pytorch-cuda=12.4=hc786d27_7 215 | - pytorch-mutex=1.0=cuda 216 | - pytz=2024.1=pyhd8ed1ab_0 217 | - pyyaml=6.0.2=py312h66e93f0_1 218 | - rapidfuzz=3.11.0=py312h2ec8cdc_0 219 | - re2=2023.09.01=h7f4b329_2 220 | - readline=8.2=h8228510_1 221 | - regex=2023.12.25=py312h98912ed_0 222 | - requests=2.32.3=pyhd8ed1ab_1 223 | - requests-toolbelt=1.0.0=pyhd8ed1ab_1 224 | - s2n=1.4.15=he19d79f_0 225 | - safetensors=0.5.2=py312h12e396e_0 226 | - scikit-learn=1.6.1=py312h7a48858_0 227 | - scipy=1.15.1=py312h180e4f1_0 228 | - sentence-transformers=3.3.1=pyhd8ed1ab_1 229 | - setuptools=75.8.0=pyhff2d567_0 230 | - six=1.16.0=pyhd8ed1ab_1 231 | - snappy=1.2.1=h8bd8927_1 232 | - sniffio=1.3.1=pyhd8ed1ab_1 233 | - 
soupsieve=2.5=pyhd8ed1ab_1 234 | - sqlalchemy=2.0.37=py312h66e93f0_0 235 | - tabulate=0.9.0=pyhd8ed1ab_2 236 | - tbb=2021.13.0=hceb3a55_1 237 | - tenacity=9.0.0=pyhd8ed1ab_1 238 | - threadpoolctl=3.5.0=pyhc1e730c_0 239 | - tk=8.6.13=noxft_h4845f30_101 240 | - tokenizers=0.21.0=py312h8360d73_0 241 | - torchaudio=2.5.1=py312_cu124 242 | - torchtriton=3.1.0=py312 243 | - torchvision=0.20.1=py312_cu124 244 | - tqdm=4.67.1=pyhd8ed1ab_1 245 | - transformers=4.48.0=pyhd8ed1ab_0 246 | - typing-extensions=4.12.2=hd8ed1ab_1 247 | - typing_extensions=4.12.2=pyha770c72_1 248 | - typing_inspect=0.9.0=pyhd8ed1ab_1 249 | - tzdata=2024b=hc8b5060_0 250 | - unstructured=0.16.12=pyhd8ed1ab_0 251 | - unstructured-client=0.28.1=pyhd8ed1ab_1 252 | - urllib3=1.26.18=pyhd8ed1ab_0 253 | - webencodings=0.5.1=pyhd8ed1ab_3 254 | - wheel=0.45.1=pyhd8ed1ab_1 255 | - wrapt=1.17.1=py312h66e93f0_0 256 | - xorg-libxau=1.0.12=hb9d3cd8_0 257 | - xorg-libxdmcp=1.1.5=hb9d3cd8_0 258 | - xxhash=0.8.2=hd590300_0 259 | - xz=5.6.3=hbcc6ac9_1 260 | - xz-gpl-tools=5.6.3=hbcc6ac9_1 261 | - xz-tools=5.6.3=hb9d3cd8_1 262 | - yaml=0.2.5=h7f98852_2 263 | - yarl=1.18.3=py312h66e93f0_0 264 | - zlib=1.2.13=h4ab18f5_6 265 | - zstandard=0.23.0=py312hef9b889_1 266 | - zstd=1.5.6=ha6fb4c9_0 267 | - pip: 268 | - accelerate==1.2.1 269 | - aiofiles==23.2.1 270 | - bitsandbytes==0.45.0 271 | - coloredlogs==15.0.1 272 | - fastapi==0.115.6 273 | - fastembed==0.5.0 274 | - ffmpy==0.5.0 275 | - flatbuffers==24.12.23 276 | - gradio==5.12.0 277 | - gradio-client==1.5.4 278 | - grpcio==1.69.0 279 | - grpcio-tools==1.69.0 280 | - httpx-sse==0.4.0 281 | - humanfriendly==10.0 282 | - langchain-community==0.3.14 283 | - loguru==0.7.3 284 | - markdown-it-py==3.0.0 285 | - markupsafe==2.1.5 286 | - mdurl==0.1.2 287 | - mmh3==4.1.0 288 | - onnx==1.17.0 289 | - onnxruntime==1.20.1 290 | - portalocker==2.10.1 291 | - protobuf==5.29.3 292 | - py-rust-stemmers==0.1.3 293 | - pydantic-settings==2.7.1 294 | - pydub==0.25.1 295 | - pygments==2.19.1 296 | - python-dotenv==1.0.1 297 | - python-multipart==0.0.20 298 | - qdrant-client==1.12.2 299 | - rich==13.9.4 300 | - ruff==0.9.1 301 | - safehttpx==0.1.6 302 | - semantic-version==2.10.0 303 | - shellingham==1.5.4 304 | - starlette==0.41.3 305 | - sympy==1.13.1 306 | - tomlkit==0.13.2 307 | - trl==0.13.0 308 | - typer==0.15.1 309 | - uvicorn==0.34.0 310 | - websockets==14.1 311 | --------------------------------------------------------------------------------